In [2]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the training set and preview its first rows.
train_df = pd.read_csv('data/train_data_set.csv')
# head must be *called*; printing the bare attribute only shows the bound method's repr.
print(train_df.head())

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would only append a stray "None" line.
train_df.info()

In [None]:
# Summary statistics for every numeric column.
summary_stats = train_df.describe()
print(summary_stats)
# Author's reading of the output: the mean is around 0.437, i.e. the numbers of
# 1s and 0s are close to equal, so the classes are treated as balanced.
# Accuracy alone is therefore considered sufficient; F1 is not needed here.

In [None]:
# Correlation of every feature with the target, under two different measures.
target_col = 'blue_win'
corr_by_method = {
    method: train_df.corr(method=method)[target_col].drop(target_col)
    for method in ('pearson', 'spearman')
}
pearson_corr = corr_by_method['pearson']
spearman_corr = corr_by_method['spearman']

# Side-by-side table, one row per feature.
correlation_df = pd.DataFrame({'Pearson': pearson_corr, 'Spearman': spearman_corr})

# Order rows from strongest to weakest absolute Pearson correlation.
strength_order = correlation_df['Pearson'].abs().sort_values(ascending=False).index
correlation_df = correlation_df.reindex(strength_order)

print(correlation_df)

In [None]:
# Overlay both correlation measures as semi-transparent bars on a single axis.
fig, ax = plt.subplots(figsize=(14, 8))

feature_names = correlation_df.index
for column, bar_color in (('Pearson', 'blue'), ('Spearman', 'red')):
    sns.barplot(
        x=feature_names,
        y=correlation_df[column],
        color=bar_color,
        alpha=0.6,
        label=column,
        ax=ax,
    )

# Axis labels, title and legend.
ax.set_xlabel('Features')
ax.set_ylabel('Correlation with blue_win')
ax.set_title('Pearson and Spearman Correlations with blue_win')
plt.xticks(rotation=90)  # vertical feature names so they do not overlap
ax.legend()

# Render the figure.
fig.tight_layout()
plt.show()

In [None]:
# Separate the target column from the feature columns.
target_name = 'blue_win'
y = train_df[target_name]
X = train_df.drop(columns=[target_name])

# Standardise every feature; the fitted scaler is kept for later reuse.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [None]:
# Preview the standardised matrix: its shape plus the first few rows,
# rather than dumping the whole array into the cell output.
print(X_scaled.shape)
print(X_scaled[:5])

In [None]:
# Hold out 10% of the standardised rows; the seed is fixed so the split is repeatable.
holdout_split = train_test_split(X_scaled, y, test_size=0.1, random_state=42)
X_train, X_test, y_train, y_test = holdout_split

In [None]:
# Candidate classifiers compared with 10-fold stratified cross-validation.
# Stochastic learners are seeded so the comparison is repeatable.
models = {
    'Logistic Regression': LogisticRegression(max_iter=10000),
    'SVM': SVC(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'k-NN': KNeighborsClassifier()
}

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

model_scores = {}
for model_name, model in models.items():
    # Scale inside the pipeline so the scaler is fitted on each training fold only.
    # The scale-sensitive models (SVM, k-NN, logistic regression) are then compared
    # on standardised features without leaking validation-fold statistics.
    scaled_model = Pipeline([('scaler', StandardScaler()), ('model', model)])
    accuracy_scores = cross_val_score(scaled_model, X, y, cv=skf, scoring='accuracy')
    model_scores[model_name] = {
        'accuracy_mean': np.mean(accuracy_scores),
        'accuracy_std': np.std(accuracy_scores),
    }

# One row per model, best mean accuracy first.
model_scores_df = pd.DataFrame(model_scores).T
model_scores_df = model_scores_df[['accuracy_mean', 'accuracy_std']]
model_scores_df = model_scores_df.sort_values(by='accuracy_mean', ascending=False)

print(model_scores_df)

In [None]:
# model_scores_df is sorted by mean accuracy (descending), so row 0 is the winner.
best_model_name = model_scores_df.index[0]
# The index holds model names (strings), so the first row must be selected by
# position with .iloc; a bare [0] relies on a deprecated positional fallback.
best_model_accuracy = model_scores_df['accuracy_mean'].iloc[0]
print(best_model_name, best_model_accuracy)

In [None]:
# Hyper-parameter grids, one per candidate model. Only the grid belonging to
# best_model_name is actually searched below.
models = {
    'Logistic Regression': {
        'model': LogisticRegression(),
        'param_grid': {
            'penalty': ['l1', 'l2', 'elasticnet', 'none'],
            'C': [0.01, 0.1, 1, 10, 100],
            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'param_grid': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    },
    'Gradient Boosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'param_grid': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 1.0],
            'max_depth': [3, 5, 7],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    },
    'SVM': {
        'model': SVC(),
        'param_grid': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'gamma': ['scale', 'auto'],
            'degree': [3, 5]
        }
    },
    'k-NN': {
        'model': KNeighborsClassifier(),
        'param_grid': {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
        }
    },
    'Decision Tree': {
        'model': DecisionTreeClassifier(random_state=42),
        'param_grid': {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    }
}

best_model = models[best_model_name]['model']
param_grid = models[best_model_name]['param_grid']

# Search over a scaler + model pipeline so that every CV fold is standardised
# using statistics from its own training part only. Pipeline parameters are
# addressed as "<step>__<param>", hence the prefixed copy of the grid.
param_prefix = 'model__'
search_estimator = Pipeline([('scaler', StandardScaler()), ('model', best_model)])
prefixed_grid = {param_prefix + name: values for name, values in param_grid.items()}

grid_search = GridSearchCV(search_estimator, prefixed_grid, cv=skf, scoring='accuracy', n_jobs=-1)

# Run the search on the raw features; scaling happens inside the pipeline.
grid_search.fit(X, y)

# Strip the step prefix again so best_params can be passed straight to the
# bare estimator's constructor, exactly as before.
best_params = {
    name[len(param_prefix):]: value
    for name, value in grid_search.best_params_.items()
}
best_score = grid_search.best_score_

print("Best Parameters for", best_model_name, ":", best_params)
print("Best Accuracy:", best_score)

In [None]:
# Bundle the outcome of the grid search into one record.
best_model_info = dict(
    model_name=best_model_name,
    best_params=best_params,
    best_accuracy=best_score,
)

# Persist the record to disk as a pickle file.
with open('best_model_info.pkl', 'wb') as info_file:
    pickle.dump(best_model_info, info_file)


In [None]:
import pickle

# Read back the record written by the previous cell.
# NOTE: only unpickle files you produced yourself; unpickling untrusted data can execute code.
with open('best_model_info.pkl', 'rb') as info_file:
    best_model_info = pickle.loads(info_file.read())

# Show the restored model name, parameters and accuracy.
print(best_model_info)

In [None]:
# Load test data
test_df = pd.read_csv('chumma.csv')

# Ensure the test DataFrame has the same structure as the training DataFrame
X_test = test_df.drop(columns=['blue_win'])
y_test = test_df['blue_win']

# Assuming X_train was scaled with StandardScaler during training
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # This is for reference, during training

# Scale the test data
X_test_scaled = scaler.transform(X_test)

In [None]:
best_model_name = best_model_info['model_name']
best_params = best_model_info['best_params']

# Initialize the model with the best parameters
if best_model_name == 'Logistic Regression':
    best_model = LogisticRegression(**best_params)
elif best_model_name == 'Random Forest':
    best_model = RandomForestClassifier(**best_params)
elif best_model_name == 'SVM':
    best_model = SVC(**best_params)
elif best_model_name == 'Gradient Boosting':
    best_model = GradientBoostingClassifier(**best_params)
elif best_model_name == 'Decision Tree':
    best_model = DecisionTreeClassifier(**best_params)
elif best_model_name == 'k-NN':
    best_model = KNeighborsClassifier(**best_params)
else:
    raise ValueError(f"Unsupported model: {best_model_name}")

# Fit the model on the entire training set (as the best parameters were found using cross-validation)
best_model.fit(X, y)

In [None]:
# Label every row of the scaled test features with the fitted model.
y_pred = best_model.predict(X_test_scaled)

# Fraction of test rows whose predicted label matches the true label.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Test Accuracy: {test_accuracy}")