In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Load the raw recommendations and turn the categorical columns into integers.
df = pd.read_csv('stock_analysis_with_date.csv')

# One encoder per column: a LabelEncoder only remembers the classes of its most
# recent fit, so sharing a single instance would discard the Recommendation
# mapping and make inverse_transform on that column impossible.
recommendation_encoder = LabelEncoder()
df['Recommendation'] = recommendation_encoder.fit_transform(df['Recommendation'])

# Duration is cast to str first so every value is encoded as a string label.
duration_encoder = LabelEncoder()
df['Duration'] = duration_encoder.fit_transform(df['Duration'].astype(str))

# Backward-compatible alias: the original single encoder ended up fitted on Duration.
label_encoder = duration_encoder

# Binary target: 1 = 'Upward', 0 = 'Downward'; any other label maps to NaN.
df['Stock Movement'] = df['Stock Movement'].map({'Upward': 1, 'Downward': 0})


In [3]:
# Preview the first five rows to check the encoded columns.
df.head(n=5)

Unnamed: 0,Stock Name,Symbol,Recommendation,CMP,Stop Loss,Target,Duration,Sentiment,Date,Stock Movement
0,Narayana Hrudayalaya Ltd,NH,0,1331.0,1275.0,1480.0,3,0.0,2024-12-06 09:29:32+00:00,1
1,Affle India Ltd,AFFLE,0,1780.0,1425.0,,4,-0.0125,2024-12-06 06:12:38+00:00,0
2,BLS International Services Ltd,BLS,0,426.0,341.0,,4,-0.010714,2024-12-06 05:52:41+00:00,0
3,PNC Infratech Ltd,PNCINFRA,0,309.0,295.0,340.0,3,0.0,2024-12-05 08:17:03+00:00,1
4,UCO Bank Ltd,UCOBANK,0,49.0,39.0,,0,-0.027778,2024-12-05 06:18:46+00:00,0


In [16]:
# Count the missing values in each column of the frame.
null_counts = df.isna().sum(axis=0)
print(null_counts)

Stock Name         0
Symbol             0
Recommendation     0
CMP                0
Stop Loss          7
Target            57
Duration           0
Sentiment          0
Date               0
Stock Movement     0
date_column        0
year               0
month              0
day                0
hour               0
minute             0
second             0
dtype: int64


In [17]:

# First pass at the feature matrix, using the raw 'Date' column.
# X and y are reassigned in a later cell that uses the split-out date parts instead.
initial_feature_columns = ['Recommendation', 'Date', 'CMP', 'Stop Loss', 'Duration', 'Sentiment']
X = df[initial_feature_columns]
y = df['Stock Movement']


In [5]:
# Parse 'Date' once, then expose each calendar/clock component as its own column.
parsed_dates = pd.to_datetime(df['Date'])
df['date_column'] = parsed_dates
for date_part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    df[date_part] = getattr(parsed_dates.dt, date_part)

In [18]:
# Preview the first five rows again, now including the derived date columns.
df.head(n=5)

Unnamed: 0,Stock Name,Symbol,Recommendation,CMP,Stop Loss,Target,Duration,Sentiment,Date,Stock Movement,date_column,year,month,day,hour,minute,second
0,Narayana Hrudayalaya Ltd,NH,0,1331.0,1275.0,1480.0,3,0.0,2024-12-06 09:29:32+00:00,1,2024-12-06 09:29:32+00:00,2024,12,6,9,29,32
1,Affle India Ltd,AFFLE,0,1780.0,1425.0,,4,-0.0125,2024-12-06 06:12:38+00:00,0,2024-12-06 06:12:38+00:00,2024,12,6,6,12,38
2,BLS International Services Ltd,BLS,0,426.0,341.0,,4,-0.010714,2024-12-06 05:52:41+00:00,0,2024-12-06 05:52:41+00:00,2024,12,6,5,52,41
3,PNC Infratech Ltd,PNCINFRA,0,309.0,295.0,340.0,3,0.0,2024-12-05 08:17:03+00:00,1,2024-12-05 08:17:03+00:00,2024,12,5,8,17,3
4,UCO Bank Ltd,UCOBANK,0,49.0,39.0,,0,-0.027778,2024-12-05 06:18:46+00:00,0,2024-12-05 06:18:46+00:00,2024,12,5,6,18,46


In [46]:
# Replace missing stop-loss values with 0 so the column contains no NaNs.
stop_loss_filled = df['Stop Loss'].fillna(value=0)
df['Stop Loss'] = stop_loss_filled

In [47]:
# Final feature matrix: encoded categoricals, date parts (seconds are left out),
# price levels and sentiment. The target is the binary stock movement.
model_feature_columns = [
    'Recommendation',
    'year', 'month', 'day', 'hour', 'minute',
    'CMP', 'Stop Loss', 'Duration', 'Sentiment',
]
X = df[model_feature_columns]
y = df['Stock Movement']


In [48]:
# Hold out 30% of the rows BEFORE any preprocessing so the test set stays unseen.
# The same random_state and row count give the same row partition as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the scaler on the training rows only. Fitting it on the full dataset lets
# the test rows' mean/variance leak into the features the model is trained on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for any later cell that expects the scaled full feature matrix.
X_scaled = scaler.transform(X)

# Baseline model: a 100-tree random forest with a fixed seed for reproducibility.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [49]:
# Score the fitted baseline model on the held-out split.
# NOTE(review): a perfect score on so few test rows is suspicious -- check whether
# one of the features effectively encodes the label before trusting it.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(
    f'Accuracy: {accuracy}',
    f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}',
    f'Classification Report: \n{classification_report(y_test, y_pred)}',
    sep='\n',
)

Accuracy: 1.0
Confusion Matrix: 
[[10  0]
 [ 0 64]]
Classification Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        64

    accuracy                           1.00        74
   macro avg       1.00      1.00      1.00        74
weighted avg       1.00      1.00      1.00        74



In [50]:
from sklearn.metrics import confusion_matrix

# Keep the matrix in a variable so it can be reused after printing.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(f'Confusion Matrix: \n{conf_matrix}')

Confusion Matrix: 
[[10  0]
 [ 0 64]]


In [51]:
from sklearn.metrics import precision_score

# 'binary' reports precision for the positive class only; switch to
# 'micro', 'macro', or 'weighted' if the target becomes multi-class.
precision = precision_score(y_true=y_test, y_pred=y_pred, average='binary')
print(f'Precision: {precision}')

Precision: 1.0


In [52]:
from sklearn.metrics import recall_score

# 'binary' reports recall for the positive class only.
recall = recall_score(y_true=y_test, y_pred=y_pred, average='binary')
print(f'Recall: {recall}')

Recall: 1.0


In [53]:
from sklearn.metrics import f1_score

# 'binary' reports the F1 score for the positive class only.
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='binary')
print(f'F1-Score: {f1}')

F1-Score: 1.0


Trying ensemble methods and grid search to find which hyperparameters work best.

In [54]:

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


In [55]:
# List only the columns that still contain missing values.
# 'Stop Loss' was already filled with 0 in an earlier cell; any column reported
# here that is not part of the feature matrix X does not affect model training.
nan_counts = df.isna().sum()
columns_with_nans = nan_counts[nan_counts > 0]
print("NaN counts per column:\n", columns_with_nans)

NaN counts per column:
 Target    57
dtype: int64


In [56]:

# Hyperparameter grids, one per candidate model.
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5, 7],
}

param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto'],
}

param_grid_knn = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
}

param_grid_dt = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

param_grid_nb = {
    'var_smoothing': [1e-9, 1e-8, 1e-7]
}

# Define the models. The stochastic estimators get a fixed seed so that the
# selected hyperparameters are reproducible across kernel restarts.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB()
}

param_grids = {
    'Random Forest': param_grid_rf,
    'Gradient Boosting': param_grid_gb,
    'SVM': param_grid_svm,
    'KNN': param_grid_knn,
    'Decision Tree': param_grid_dt,
    'Naive Bayes': param_grid_nb
}

# Run a 5-fold grid search per model on the training split and keep the winner.
# The loop variable is deliberately not called `model`, so the fitted baseline
# `model` from the earlier cell is not overwritten by an unfitted estimator.
best_estimators = {}
for model_name, candidate in models.items():
    print(f"Performing GridSearch for {model_name}...")
    grid_search = GridSearchCV(candidate, param_grids[model_name], cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_estimators[model_name] = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")


Performing GridSearch for Random Forest...
Best parameters for Random Forest: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Performing GridSearch for Gradient Boosting...
Best parameters for Gradient Boosting: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}
Performing GridSearch for SVM...
Best parameters for SVM: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
Performing GridSearch for KNN...
Best parameters for KNN: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Performing GridSearch for Decision Tree...
Best parameters for Decision Tree: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Performing GridSearch for Naive Bayes...
Best parameters for Naive Bayes: {'var_smoothing': 1e-09}


Stacking Classifier


Stacking involves training a meta-model on the outputs of base models.

In [60]:

# Base learners are the tuned estimators picked by the grid search; a logistic
# regression is the meta-model that combines their outputs.
base_learner_keys = (
    ('rf', 'Random Forest'),
    ('gb', 'Gradient Boosting'),
    ('svm', 'SVM'),
    ('knn', 'KNN'),
    ('dt', 'Decision Tree'),
    ('nb', 'Naive Bayes'),
)
base_learners = [(alias, best_estimators[key]) for alias, key in base_learner_keys]

stacking_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(),
)
stacking_clf.fit(X_train, y_train)

# Evaluate the stacked ensemble on the held-out split.
y_pred_stacking = stacking_clf.predict(X_test)
stacking_accuracy = accuracy_score(y_test, y_pred_stacking)
stacking_report = classification_report(y_test, y_pred_stacking)
stacking_confusion = confusion_matrix(y_test, y_pred_stacking)

print(f"Stacking Classifier Accuracy: {stacking_accuracy}")
print(f"Classification Report: \n{stacking_report}")
print(f"Confusion Matrix: \n{stacking_confusion}")


Stacking Classifier Accuracy: 1.0
Classification Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        64

    accuracy                           1.00        74
   macro avg       1.00      1.00      1.00        74
weighted avg       1.00      1.00      1.00        74

Confusion Matrix: 
[[10  0]
 [ 0 64]]


Model Comparison

In [62]:
# Report held-out metrics for every tuned estimator.
# Loop-local names are used on purpose: reusing `model` / `y_pred` here would
# silently replace the baseline model and its predictions that earlier metric
# cells read, giving different numbers if those cells are re-run afterwards.
for model_name, tuned_estimator in best_estimators.items():
    tuned_pred = tuned_estimator.predict(X_test)
    print(f"{model_name} Accuracy: {accuracy_score(y_test, tuned_pred)}")
    print(f"Classification Report: \n{classification_report(y_test, tuned_pred)}")
    print(f"Confusion Matrix: \n{confusion_matrix(y_test, tuned_pred)}")
    print("-" * 50)


# Repeat the stacked ensemble's scores so they can be compared side by side.
print("Stacking Classifier Evaluation:")
print(f"Stacking Classifier Accuracy: {accuracy_score(y_test, y_pred_stacking)}")
print(f"Classification Report: \n{classification_report(y_test, y_pred_stacking)}")


Random Forest Accuracy: 1.0
Classification Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        64

    accuracy                           1.00        74
   macro avg       1.00      1.00      1.00        74
weighted avg       1.00      1.00      1.00        74

Confusion Matrix: 
[[10  0]
 [ 0 64]]
--------------------------------------------------
Gradient Boosting Accuracy: 1.0
Classification Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        64

    accuracy                           1.00        74
   macro avg       1.00      1.00      1.00        74
weighted avg       1.00      1.00      1.00        74

Confusion Matrix: 
[[10  0]
 [ 0 64]]
--------------------------------------------------
SVM Accuracy: 1.0
Classification Report: 
              prec