In [1]:
# Colab-specific helper used to attach Google Drive to this runtime.
from google.colab import drive
# Mount Drive under /content/drive; the CSV paths read later in this notebook
# all live below /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Read the two training sets and the test set from the mounted Drive folder
data1, data2, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/data/training1.csv',
        '/content/drive/MyDrive/data/training2.csv',
        '/content/drive/MyDrive/data/test.csv',
    )
)

# Report (rows, columns) of each frame as a quick sanity check
for caption, frame in (
    ("Training Data 1 shape:", data1),
    ("Training Data 2 shape:", data2),
    ("Test Data shape:", test_data),
):
    print(caption, frame.shape)

Training Data 1 shape: (400, 3458)
Training Data 2 shape: (2750, 3458)
Test Data shape: (1000, 3456)


In [4]:
# Identify the feature columns in training and test data.
# The last two columns of data1 are excluded: the notebook reads 'label' and
# 'confidence' by name and uses everything before them as features.
training_columns = data1.columns[:-2]

# Keep EVERY test column. Slicing off the last one (the previous behaviour)
# made a column that is present in the test file look "missing", so it was
# later overwritten with a constant training mean. Using the full set is safe
# either way: extra non-feature columns are ignored by the set difference
# against training_columns and dropped when the frame is re-indexed.
test_columns = test_data.columns

In [5]:
# Training feature names that have no counterpart among the test columns
missing_cols = set(training_columns).difference(test_columns)

In [6]:
# Give the test frame every feature it lacks, filled with a constant taken
# from the training data. missing_cols is derived from data1's columns, so no
# membership check against data1 is needed here.
for col in missing_cols:
    # data2 gaps are replaced by data2's own column mean (as before).
    pooled_values = np.concatenate(
        (data1[col], data2[col].fillna(data2[col].mean()))
    )
    # nanmean instead of mean: a single NaN in data1[col] would otherwise
    # turn the fill value into NaN for the whole test column.
    test_data[col] = np.nanmean(pooled_values)

# Ensure all columns are aligned: same features, same order as training
test_data = test_data[training_columns]


In [7]:
# Break each training frame into its three parts: the feature matrix
# (everything except the last two columns), the 'label' vector and the
# 'confidence' vector.
X_data1, y_data1, conf_data1 = (
    data1.iloc[:, :-2].values,
    data1['label'].values,
    data1['confidence'].values,
)
X_data2, y_data2, conf_data2 = (
    data2.iloc[:, :-2].values,
    data2['label'].values,
    data2['confidence'].values,
)

# Stack both sources row-wise, data1 first, so the three arrays stay aligned
X_combined_train = np.concatenate([X_data1, X_data2], axis=0)
y_combined = np.concatenate([y_data1, y_data2])
conf_combined = np.concatenate([conf_data1, conf_data2])

In [8]:
# Fit a 5-nearest-neighbour imputer on the stacked training features,
# then use it to fill the missing entries of that same matrix
knn_imputer = KNNImputer(n_neighbors=5)
knn_imputer.fit(X_combined_train)
X_combined_train_imputed = knn_imputer.transform(X_combined_train)

In [9]:
# Fill gaps in the test features with the imputer already fitted on training data
X_test = test_data.to_numpy()
X_test_imputed = knn_imputer.transform(X_test)

In [10]:
# Confirm that imputation kept the dimensions of both matrices
for caption, matrix in (
    ("Shape of imputed combined training data:", X_combined_train_imputed),
    ("Shape of imputed test data:", X_test_imputed),
):
    print(caption, matrix.shape)

Shape of imputed combined training data: (3150, 3456)
Shape of imputed test data: (1000, 3456)


In [12]:
# Standardise features using statistics learned from the training matrix only,
# then apply the same transform to the test matrix.
scaler = StandardScaler()
X_combined_train_scaled = scaler.fit_transform(X_combined_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Recursive feature elimination down to 50 features.
# The previous settings (step=10, single-threaded forest) needed roughly
# (3456 - 50) / 10 ≈ 341 forest fits and the run was interrupted before it
# finished. A fractional step removes 10% of the original feature count per
# round (about ten rounds in total) and n_jobs=-1 grows the trees on all
# available cores. NOTE: a coarser step may select a slightly different
# feature set than step=10 would have.
estimator = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
selector = RFE(estimator, n_features_to_select=50, step=0.1)
selector = selector.fit(X_combined_train_scaled, y_combined)

KeyboardInterrupt: 

In [None]:
# Keep only the columns retained by the fitted selector, for train and test alike
X_combined_train_selected, X_test_selected = (
    selector.transform(scaled_matrix)
    for scaled_matrix in (X_combined_train_scaled, X_test_scaled)
)

# Rebuild the stacked label / confidence vectors from the per-file arrays
# (data1 entries first, then data2)
y_combined = np.concatenate([y_data1, y_data2])
conf_combined = np.concatenate([conf_data1, conf_data2])

In [None]:
# Hold out 20% of the selected-feature matrix for validation (seeded shuffle)
X_train, X_val, y_train, y_val = train_test_split(X_combined_train_selected, y_combined, test_size=0.2, random_state=42)

# Initialize classifiers.
# The stochastic learners (random forest, MLP) are seeded so that the
# cross-validation comparison gives the same numbers on every run.
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(),
    'MLP': MLPClassifier(max_iter=500, random_state=42)
}

In [None]:
# 5-fold cross-validation of every candidate model on the training split;
# the per-fold scores are stored under the model's display name
cv_results = {}
for model_name in classifiers:
    fold_scores = cross_val_score(classifiers[model_name], X_train, y_train, cv=5)
    cv_results[model_name] = fold_scores
    print(f'{model_name} - CV Scores: {fold_scores}')

In [None]:
# Calculate the mean and standard deviation of cross-validation scores
cv_means = {name: np.mean(scores) for name, scores in cv_results.items()}
cv_stds = {name: np.std(scores) for name, scores in cv_results.items()}

# Create a DataFrame for plotting (one row per classifier)
cv_df = pd.DataFrame({'Classifier': list(cv_means.keys()), 'Mean CV Score': list(cv_means.values()), 'CV Std Dev': list(cv_stds.values())})

# Plot the bars and their error bars in a single matplotlib call.
# The values are already aggregated, so seaborn's statistical barplot is not
# needed; this also avoids its deprecated `ci=` argument, the palette-without-hue
# warning, and a `capsize` that had no error bars to apply to.
fig, ax = plt.subplots(figsize=(12, 8))
bar_colors = sns.color_palette('viridis', n_colors=len(cv_df))
ax.bar(
    cv_df['Classifier'],
    cv_df['Mean CV Score'],
    yerr=cv_df['CV Std Dev'],
    color=bar_colors,
    ecolor='black',
    capsize=5,
)
ax.set_title('Cross-validation Scores for Different Classifiers')
ax.set_xlabel('Classifier')
ax.set_ylabel('Mean CV Score')

# Keep the original 0.6-0.8 window, but widen it when a mean ± std falls
# outside so that no bar or whisker is silently clipped.
y_low = min(0.6, (cv_df['Mean CV Score'] - cv_df['CV Std Dev']).min())
y_high = max(0.8, (cv_df['Mean CV Score'] + cv_df['CV Std Dev']).max())
ax.set_ylim(y_low, y_high)
plt.show()

In [None]:
# Handling Confidence Labels by training with sample weights.
# X_train is a SHUFFLED 80% subset of the combined data, so taking the first
# len(X_train) confidence values would pair each sample with somebody else's
# weight. Re-create the split on row indices with the same settings used for
# X_train (test_size=0.2, random_state=42) to recover which rows went where.
train_idx, val_idx = train_test_split(
    np.arange(len(conf_combined)), test_size=0.2, random_state=42
)
# Guard: the recovered rows must reproduce y_train exactly, otherwise the
# split settings above no longer mirror the ones that produced X_train.
assert np.array_equal(y_combined[train_idx], y_train), \
    "Recovered training indices do not match y_train; keep split settings in sync"
conf_train = conf_combined[train_idx]

# Seeded so the reported validation metrics are reproducible
best_clf = RandomForestClassifier(random_state=42)
best_clf.fit(X_train, y_train, sample_weight=conf_train)

# Predictions on the validation set
val_predictions = best_clf.predict(X_val)

# Evaluate model
accuracy = accuracy_score(y_val, val_predictions)
precision = precision_score(y_val, val_predictions)
recall = recall_score(y_val, val_predictions)
f1 = f1_score(y_val, val_predictions)

print(f'Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-score: {f1}')


In [None]:
# Tabulate validation labels against the model's predictions
cm = confusion_matrix(y_val, val_predictions)

# Draw the table as an annotated heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# Score the validation set with the probability of the second class
# (column index 1 of predict_proba), then derive the ROC curve and its area
y_val_prob = best_clf.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_prob)
roc_auc = auc(fpr, tpr)

# Draw the curve together with the diagonal reference line
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Predictions on the test set
test_predictions = best_clf.predict(X_test_selected)

# Use the test frame's own 'id' column when it has one; otherwise number the
# rows 1..n. The ids are built in a separate variable rather than written
# into test_data, so the feature frame keeps its column set and the earlier
# cells that convert test_data to a matrix stay safe to re-run.
if 'id' in test_data.columns:
    submission_ids = test_data['id'].to_numpy()
else:
    submission_ids = np.arange(1, len(test_data) + 1)

# Save predictions to CSV
submission = pd.DataFrame({'id': submission_ids, 'label': test_predictions})
submission.to_csv('predictions.csv', index=False)