In [None]:
# %pip install xgboost  # uncomment and run once if xgboost is not installed in the kernel environment

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [None]:
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# %pip install imbalanced-learn  # uncomment and run once if imbalanced-learn is not installed in the kernel environment

In [None]:
from imblearn.over_sampling import ADASYN

In [None]:
# %pip install tslearn  # uncomment and run once if tslearn is not installed in the kernel environment

In [None]:
from tslearn.clustering import TimeSeriesKMeans

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import VotingClassifier

# --- Data loading ------------------------------------------------------------
# The label is the 'trending' column; the features are every remaining column
# except 'Time'.
data = pd.read_csv('EDA4_nan820.csv')
y = data['trending']
X = data.drop(columns=['trending', 'Time'])

# --- Positional hold-out split -----------------------------------------------
# Rows before `split_index` are used for fitting; rows from it onward are held out.
split_index = 820
X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

# --- Feature engineering -----------------------------------------------------
# Both transformers are fitted on the training rows only and then re-applied,
# unchanged, to the held-out rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

# --- Class rebalancing -------------------------------------------------------
# ADASYN oversampling is applied to the training matrix only.
adasyn = ADASYN(random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train_poly, y_train)

# --- Random forest hyper-parameter search ------------------------------------
# Exhaustive 3-fold grid search; the refitted winner is kept as `best_rf`.
# NOTE(review): the folds are cut from the already-oversampled data, so
# synthetic points can end up in validation folds and the CV scores may be
# optimistic -- consider resampling inside an imblearn Pipeline instead.
param_grid_rf = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
grid_search_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    cv=3,
)
grid_search_rf.fit(X_train_adasyn, y_train_adasyn)
best_rf = grid_search_rf.best_estimator_

# Trend-aware loss helper. NOTE: it is only defined here -- the XGBoost grid search below trains with the built-in 'binary:logistic' objective.
def custom_loss(y_true, y_pred, eps=1e-15):
    """Binary log loss plus a penalty on changes between consecutive predictions.

    Args:
        y_true: Array-like of binary labels (0/1), in the same order as ``y_pred``.
        y_pred: Array-like of predicted positive-class probabilities.
        eps: Probabilities are clipped to ``[eps, 1 - eps]`` before the logarithms
            are taken, so exact 0/1 predictions do not produce ``inf``/``nan``.

    Returns:
        Scalar: the summed (not averaged) binary cross-entropy plus the sum of
        absolute differences between neighbouring predictions.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Total variation of the prediction sequence: every flip between
    # neighbouring samples adds cost, which penalizes short trends.
    penalty = np.sum(np.abs(np.diff(y_pred)))

    # Clip only for the logarithms; the penalty above uses the raw predictions.
    clipped = np.clip(y_pred, eps, 1 - eps)
    log_loss = -np.sum(y_true * np.log(clipped) + (1 - y_true) * np.log(1 - clipped))
    return log_loss + penalty

# Search space for the gradient-boosted model ('alpha' / 'lambda' are
# XGBoost's L1 / L2 regularisation weights).
param_grid_xgb = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'alpha': [0, 0.1, 1],
    'lambda': [0, 0.1, 1],
}

# Exhaustive 3-fold grid search on the oversampled training matrix, using the
# built-in logistic objective; the refitted winner is kept as `best_xgb`.
grid_search_xgb = GridSearchCV(
    estimator=xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42,
    ),
    param_grid=param_grid_xgb,
    cv=3,
)
grid_search_xgb.fit(X_train_adasyn, y_train_adasyn)
best_xgb = grid_search_xgb.best_estimator_

# Soft-voting ensemble of the two tuned models, fitted on the same oversampled data.
ensemble_model = VotingClassifier(
    estimators=[('rf', best_rf), ('xgb', best_xgb)],
    voting='soft',
)
ensemble_model.fit(X_train_adasyn, y_train_adasyn)

# In-sample predictions on the original (non-oversampled) polynomial training features.
y_train_pred_ensemble = ensemble_model.predict(X_train_poly)

In [None]:
# Preprocess the held-out rows with the scaler and polynomial expander that
# were fitted on the training rows (transform only -- nothing is refitted here).
X_test_scaled = scaler.transform(X_test)
X_test_poly = poly.transform(X_test_scaled)

# Predict labels for the held-out rows with the fitted ensemble model.
y_test_pred_ensemble = ensemble_model.predict(X_test_poly)

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import matplotlib.pyplot as plt

# --- Data loading ------------------------------------------------------------
# The label is 'Corrected_Predictions'; 'Time' and 'trending' are excluded
# from the features as well.
# NOTE: this cell rebinds data, X, y, split_index, X_train, y_train, scaler,
# poly and adasyn; X_test / y_test are NOT reassigned here.
data = pd.read_csv('Corrected_Predictions_nan1190.csv')
y = data['Corrected_Predictions']
X = data.drop(columns=['Corrected_Predictions', 'Time', 'trending'])

# --- Training slice ----------------------------------------------------------
# Only the first `split_index` rows are used for fitting.
split_index = 1190
X_train, y_train = X.iloc[:split_index], y.iloc[:split_index]

# --- Feature engineering -----------------------------------------------------
# Standardise, then expand to degree-2 polynomial features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train_scaled)

# --- Class rebalancing -------------------------------------------------------
adasyn = ADASYN(random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train_poly, y_train)

# --- Random forest -----------------------------------------------------------
# Fixed-size forest fitted on the oversampled data (`fit` returns the estimator itself).
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_adasyn, y_train_adasyn
)

# Train XGBoost
def custom_loss(y_pred, dtrain):
    """Custom XGBoost objective: gradient and Hessian of a 2x-scaled logistic loss.

    Current XGBoost releases pass *raw margin scores* (not probabilities) to a
    callable given as ``obj=`` in ``xgb.train``, so the margins are mapped
    through a sigmoid before the derivatives are computed. The Hessian is the
    actual second derivative with respect to the margin and is floored so it
    stays strictly positive.

    NOTE(review): this name shadows the ``custom_loss(y_true, y_pred)`` helper
    defined in an earlier cell, which has a different signature.

    Args:
        y_pred: Raw (untransformed) scores for the rows of ``dtrain``.
        dtrain: Object exposing ``get_label()`` (an ``xgb.DMatrix`` in this
            notebook) that returns the 0/1 labels.

    Returns:
        Tuple ``(grad, hess)`` of arrays with one entry per row.
    """
    y_true = dtrain.get_label()

    # Numerically stable sigmoid: 1 / (1 + exp(-m)) == exp(-log(1 + exp(-m))).
    margin = np.asarray(y_pred, dtype=float)
    prob = np.exp(-np.logaddexp(0.0, -margin))

    # Gradient 2 * (p - y), written out per class.
    grad = np.where(y_true == 1, -2.0 * (1.0 - prob), 2.0 * prob)

    # Second derivative w.r.t. the margin; the floor keeps saturated rows
    # (p == 0 or p == 1) from contributing a zero Hessian.
    hess = np.maximum(2.0 * prob * (1.0 - prob), 1e-16)
    return grad, hess

# Wrap the oversampled training matrix and labels in XGBoost's native container.
dtrain = xgb.DMatrix(data=X_train_adasyn, label=y_train_adasyn)

# Booster settings: tree depth, learning rate ('eta') and L1 / L2
# regularisation weights ('alpha' / 'lambda').
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 6,
    'eta': 0.1,
    'alpha': 1,
    'lambda': 1,
}

# Run 1000 boosting rounds, with gradients/Hessians supplied by `custom_loss`.
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    obj=custom_loss,
)

# Ensemble predictions
class EnsembleModel:
    """Gate one model's predicted labels with an XGBoost booster's scores.

    A row keeps ``model1``'s predicted label only when ``model2``'s score for
    that row is strictly greater than ``threshold``; otherwise the prediction
    for that row is forced to 0.

    Args:
        model1: Fitted estimator exposing ``predict(X)`` and returning labels.
        model2: Trained booster whose ``predict`` accepts an ``xgb.DMatrix``
            and returns one score per row.
        threshold: Score cut-off above which ``model1``'s label is kept.
            Defaults to 0.5.
    """

    def __init__(self, model1, model2, threshold=0.5):
        self.model1 = model1
        self.model2 = model2
        self.threshold = threshold

    def predict(self, X):
        """Return the gated predictions for feature matrix ``X``.

        Args:
            X: Feature matrix accepted by both ``model1.predict`` and
                ``xgb.DMatrix``.

        Returns:
            Array with ``model1``'s label where the booster score exceeds
            ``self.threshold`` and 0 elsewhere.
        """
        # Booster score per row (the booster is fed a DMatrix built from X).
        booster_score = self.model2.predict(xgb.DMatrix(X))
        # Labels from the first model on the same rows.
        label_pred = self.model1.predict(X)
        # Keep a label only where the booster agrees strongly enough.
        return np.where(booster_score > self.threshold, label_pred, 0)

# Combine the random forest and the booster, then predict on the
# non-oversampled polynomial training features.
ensemble_model = EnsembleModel(rf_clf, bst)
y_train_pred_ensemble = ensemble_model.predict(X_train_poly)

# In-sample scores: these are computed on the training rows themselves, so
# they are not an estimate of out-of-sample performance.
accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred_ensemble)
precision = precision_score(y_true=y_train, y_pred=y_train_pred_ensemble)
recall = recall_score(y_true=y_train, y_pred=y_train_pred_ensemble)
f1 = f1_score(y_true=y_train, y_pred=y_train_pred_ensemble)

# Display the four scores as the cell output.
accuracy, precision, recall, f1

In [None]:
# Count labelled vs. unlabelled rows in y_test.
# NOTE(review): y_test is only assigned in the EDA4_nan820 cell; the
# Corrected_Predictions cell does not rebind it, so this inspects the first
# dataset's hold-out labels -- confirm that is intended.
non_nan_count = len(y_test.dropna())
nan_count = y_test.isna().sum()
non_nan_count, nan_count

In [None]:
# Inspect the held-out feature frame: its (rows, columns) shape and its
# leading rows. In the visible notebook X_test is only assigned in the
# EDA4_nan820 cell.
X_test.shape, X_test.head()

In [None]:
# Re-read 'Corrected_Predictions_nan1190.csv' and separate the label column
# from the feature columns ('Time' and 'trending' are dropped too).
data_corrected = pd.read_csv('Corrected_Predictions_nan1190.csv')
y_corrected = data_corrected['Corrected_Predictions']
X_corrected = data_corrected.drop(columns=['Corrected_Predictions', 'Time', 'trending'])

# Keep only the feature rows whose label is missing.
X_missing_corrected = X_corrected.loc[y_corrected.isna()]

# Push those rows through the fitted scaler and polynomial expander, then let
# the ensemble fill in the missing labels.
y_missing_pred_corrected = ensemble_model.predict(
    poly.transform(
        scaler.transform(X_missing_corrected)
    )
)
y_missing_pred_corrected