In [28]:
import pandas as pd
# Raw daily price history. NOTE(review): the /content prefix looks like a Colab
# upload location — adjust DATA_PATH when running elsewhere.
DATA_PATH = '/content/Netflix_stock_data.csv'
df = pd.read_csv(DATA_PATH)

# Column dtypes / non-null counts first, then the leading rows as the cell output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5817 entries, 0 to 5816
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    5817 non-null   object 
 1   Close   5817 non-null   float64
 2   High    5817 non-null   float64
 3   Low     5817 non-null   float64
 4   Open    5817 non-null   float64
 5   Volume  5817 non-null   int64  
dtypes: float64(4), int64(1), object(1)
memory usage: 272.8+ KB


Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2002-05-23,1.196429,1.242857,1.145714,1.156429,104790000
1,2002-05-24,1.21,1.225,1.197143,1.214286,11104800
2,2002-05-28,1.157143,1.232143,1.157143,1.213571,6609400
3,2002-05-29,1.103571,1.164286,1.085714,1.164286,6757800
4,2002-05-30,1.071429,1.107857,1.071429,1.107857,10154200


In [29]:
# 'Date' is loaded as plain strings (dtype object); convert it to datetime64 so
# the rows can be ordered chronologically and date ranges can be compared.
df['Date'] = pd.to_datetime(df['Date'])


In [30]:
# Put rows in chronological order and renumber the index 0..n-1 in one call.
df = df.sort_values('Date', ignore_index=True)

In [31]:
# Label a row 1 when the following row's close is strictly higher than its own.
# The final row has no following close (NaN), so the comparison is False -> 0.
next_close = df['Close'].shift(-1)
df['Target'] = next_close.gt(df['Close']).astype(int)

In [32]:
# Discard the final row by position: its Target was derived without a
# following close, so it is not a real label.
df = df.iloc[:-1]


In [33]:
# Count rows per Target value to see how evenly the two classes are represented.
class_balance = df['Target'].value_counts()


In [34]:
# Show the leading rows (now carrying the Target column) next to the class counts.
df.head(), class_balance

(        Date     Close      High       Low      Open     Volume  Target
 0 2002-05-23  1.196429  1.242857  1.145714  1.156429  104790000       1
 1 2002-05-24  1.210000  1.225000  1.197143  1.214286   11104800       0
 2 2002-05-28  1.157143  1.232143  1.157143  1.213571    6609400       0
 3 2002-05-29  1.103571  1.164286  1.085714  1.164286    6757800       0
 4 2002-05-30  1.071429  1.107857  1.071429  1.107857   10154200       1,
 Target
 1    2951
 0    2865
 Name: count, dtype: int64)

In [35]:
# Chronological hold-out: the first 70% of rows are used for training,
# the remainder for testing.
TRAIN_FRACTION = 0.7
split_idx = int(len(df) * TRAIN_FRACTION)

In [36]:
# Positional split at split_idx; each part is an independent copy so later
# column additions do not write back into `df`.
train_df, test_df = df.iloc[:split_idx].copy(), df.iloc[split_idx:].copy()

In [37]:
# Sanity check: shapes of both splits plus their first/last dates, so any
# overlap between the training and test periods would be visible here.
train_df.shape, test_df.shape, train_df['Date'].min(), train_df['Date'].max(), test_df['Date'].min(), test_df['Date'].max()

((4071, 7),
 (1745, 7),
 Timestamp('2002-05-23 00:00:00'),
 Timestamp('2018-07-24 00:00:00'),
 Timestamp('2018-07-25 00:00:00'),
 Timestamp('2025-07-03 00:00:00'))

## Feature Engineering


In [38]:
def add_features(df, target_threshold=0.005):
    """Return a copy of ``df`` with lag, rolling and momentum features plus a label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Close``, ``High``, ``Low`` and ``Volume`` columns. Every
        feature is built from positional shifts / rolling windows, so rows are
        expected to already be in chronological order.
    target_threshold : float, default 0.005
        A row is labelled 1 only when the next row's return is strictly greater
        than this value (0.005 == +0.5%). The default reproduces the original
        hard-coded threshold.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with these columns added:
        ``Close_Lag1..3``, ``High_Lag1``, ``Low_Lag1``, ``Volume_Lag1``,
        ``MA5``, ``MA10``, ``MA20``, ``Volatility5``, ``Return``,
        ``Return_Lag1``, ``RSI14``, ``Daily_Return`` and ``Target``.
        An existing ``Target`` column is overwritten.

    Notes
    -----
    * Leading rows contain NaN until each lag / rolling window is filled.
    * The last row has no next-row return: ``Daily_Return`` is NaN there and
      ``Target`` is 0. Drop NaN rows (e.g. ``dropna()``) before training so that
      this placeholder label is not used.
    """
    df = df.copy()
    close = df['Close']

    # Lagged closes
    for lag in (1, 2, 3):
        df[f'Close_Lag{lag}'] = close.shift(lag)

    # Previous-row high, low and volume
    df['High_Lag1'] = df['High'].shift(1)
    df['Low_Lag1'] = df['Low'].shift(1)
    df['Volume_Lag1'] = df['Volume'].shift(1)

    # Moving averages of the close
    for window in (5, 10, 20):
        df[f'MA{window}'] = close.rolling(window=window).mean()

    # Volatility: rolling standard deviation of the close
    df['Volatility5'] = close.rolling(window=5).std()

    # Row-over-row return and its one-row lag
    df['Return'] = close.pct_change()
    df['Return_Lag1'] = df['Return'].shift(1)

    # RSI over 14 rows using simple rolling means of gains and losses.
    # A window with no losses gives RS = inf and RSI = 100; a completely flat
    # window gives 0/0 = NaN.
    delta = close.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    RS = gain / loss
    df['RSI14'] = 100 - (100 / (1 + RS))

    # Stronger target label: next row's return must clear the threshold.
    # Reuses 'Return' instead of recomputing pct_change(); a NaN next-row return
    # compares False and therefore maps to 0 (see Notes).
    df['Daily_Return'] = df['Return'].shift(-1)
    df['Target'] = (df['Daily_Return'] > target_threshold).astype(int)

    return df


In [41]:
#IMPORTS
import pandas as pd
import numpy as np

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

#SELECT FEATURES
# Model inputs, grouped by the kind of signal each column carries; the final
# list keeps the same order the columns are created in.
lag_cols = [f'Close_Lag{k}' for k in (1, 2, 3)] + ['High_Lag1', 'Low_Lag1', 'Volume_Lag1']
trend_cols = [f'MA{w}' for w in (5, 10, 20)]
momentum_cols = ['Volatility5', 'Return', 'Return_Lag1', 'RSI14']
feature_cols = lag_cols + trend_cols + momentum_cols

# Name of the label column to predict.
target_col = 'Target'


# Engineer features on each split separately (no lag or rolling window reaches
# across the train/test boundary), then drop every row that still holds a NaN
# and renumber the index.
train_df = add_features(train_df).dropna().reset_index(drop=True)
test_df = add_features(test_df).dropna().reset_index(drop=True)

# Feature matrices and label vectors for both splits.
X_train, y_train = train_df[feature_cols], train_df[target_col]
X_test, y_test = test_df[feature_cols], test_df[target_col]

#SCALE FEATURES
# Standardisation statistics are learned from the training rows only and then
# applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

#DEFINE MODELS
# RandomForest draws bootstrap samples and random feature subsets, so without a
# seed the selected hyperparameters and the reported test metrics change on
# every run. Pin the seed so Restart & Run All reproduces the same numbers.
# XGBoost is pinned explicitly as well so its seed does not depend on library
# defaults.
RANDOM_STATE = 42

models = {
    'LogisticRegression': LogisticRegression(),
    'RandomForest': RandomForestClassifier(random_state=RANDOM_STATE),
    'SVM': SVC(),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=RANDOM_STATE)
}

#DEFINE HYPERPARAMETER GRIDS
# One search grid per entry in `models`, keyed by the same model name.
param_grids = dict(
    LogisticRegression=dict(C=[0.01, 0.1, 1, 10]),
    RandomForest=dict(n_estimators=[50, 100], max_depth=[5, 10, None]),
    SVM=dict(C=[0.1, 1, 10], kernel=['linear', 'rbf']),
    XGBoost=dict(n_estimators=[50, 100], max_depth=[3, 5, 7]),
)

#TIME SERIES SPLIT for CV
# Five ordered folds: TimeSeriesSplit validates each fold on rows that come
# after that fold's training rows, so tuning never trains on later data.
# This relies on the training frame being in chronological order.
tscv = TimeSeriesSplit(n_splits=5)

#TRAIN, TUNE, AND EVALUATE
# Hyperparameters are tuned on a scaler -> classifier pipeline fed with the
# UNSCALED training features. That way, inside every TimeSeriesSplit fold, the
# scaler is fitted on that fold's training rows only. Searching on a matrix
# that was scaled once with statistics from all training rows lets each fold's
# validation rows influence the mean/std used to transform them.
results = []

for name, model in models.items():
    print(f"Training {name}...")

    pipeline = Pipeline([('scaler', StandardScaler()), ('clf', model)])
    # GridSearchCV addresses parameters of a pipeline step as "<step>__<param>".
    param_grid = {f"clf__{key}": values for key, values in param_grids[name].items()}

    # Use GridSearchCV
    grid = GridSearchCV(pipeline, param_grid, cv=tscv, scoring='accuracy')
    grid.fit(X_train, y_train)

    # After the search the best pipeline is refit on all of X_train (refit=True
    # is the default), so test rows are scaled with training-set statistics only.
    y_pred = grid.predict(X_test)

    # Keep `best_model` as the bare fitted classifier (it expects scaled input),
    # and report parameters under the same keys used in `param_grids`.
    best_model = grid.best_estimator_.named_steps['clf']
    best_params = {key.split('__', 1)[1]: value for key, value in grid.best_params_.items()}

    acc = accuracy_score(y_test, y_pred)
    # zero_division=0: a model that predicts no positives scores 0 without
    # raising an UndefinedMetricWarning.
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"{name} - Best Params: {best_params}")
    print(f"{name} - Test Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("-"*40)

    results.append({
        'Model': name,
        'Best_Params': best_params,
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1': f1
    })

#COMPARE RESULTS
# One row per model, built once from the collected records.
results_df = pd.DataFrame(results)
print("\nFinal Comparison:")
print(results_df)


Training LogisticRegression...
LogisticRegression - Best Params: {'C': 0.01}
LogisticRegression - Test Accuracy: 0.5840
Precision: 0.1250, Recall: 0.0014, F1: 0.0028
Confusion Matrix:
[[983   7]
 [694   1]]
----------------------------------------
Training RandomForest...
RandomForest - Best Params: {'max_depth': 5, 'n_estimators': 100}
RandomForest - Test Accuracy: 0.5276
Precision: 0.3980, Recall: 0.2835, F1: 0.3311
Confusion Matrix:
[[692 298]
 [498 197]]
----------------------------------------
Training SVM...
SVM - Best Params: {'C': 1, 'kernel': 'rbf'}
SVM - Test Accuracy: 0.5881
Precision: 0.5294, Recall: 0.0129, F1: 0.0253
Confusion Matrix:
[[982   8]
 [686   9]]
----------------------------------------
Training XGBoost...
XGBoost - Best Params: {'max_depth': 3, 'n_estimators': 50}
XGBoost - Test Accuracy: 0.4813
Precision: 0.4181, Recall: 0.6576, F1: 0.5112
Confusion Matrix:
[[354 636]
 [238 457]]
----------------------------------------

Final Comparison:
                Mode