In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing

from scipy import stats as st
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error,accuracy_score,f1_score
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from scipy.sparse import hstack
from imblearn.over_sampling import RandomOverSampler

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC


In [2]:
# Seed handed as `random_state` to the train/test split, the over-sampler and
# the classifiers defined further down.
random_state = 42
# NOTE(review): `seed` is not referenced by any other cell shown in this notebook.
seed = 5

In [3]:
# Training data; it carries the "Target Variable (Discrete)" label column.
df = pd.read_csv("./data/iith_foml_2023_train.csv")
# Rows that the final predictions (the submission file) are produced for.
df_valid = pd.read_csv("./data/test_input.csv")

In [4]:
# Number of rows and columns in the training frame.
df.shape

(994, 25)

In [5]:
# Preview the first rows of the training frame.
df.head()

Unnamed: 0,Feature 1 (Discrete),Feature 2 (Discrete),Feature 3 (Discrete),Feature 4 (Discrete),Feature 5 (Discrete),Feature 6 (Discrete),Feature 7 (Discrete),Feature 8 (Discrete),Feature 9,Feature 10,...,Feature 16,Feature 17,Feature 18,Feature 19 (Discrete),Feature 20 (Discrete),Feature 21 (Discrete),Feature 22 (Discrete),Feature 23 (Discrete),Feature 24,Target Variable (Discrete)
0,1404,12,64,14,3,1,1,1,110.502,35775.2,...,,,15.04,104,12,2,32,1409,37677.1,1
1,909,0,235,32,1,1,1,1,-40.448,35779.4,...,2200.3,4900.005,12.03,20,1,0,13,909,25239.1,1
2,654,3,175,2,1,1,1,1,-27.445,35770.4,...,1973.3,10000.004,13.01,1,1,0,13,654,27683.5,1
3,1372,12,382,14,2,0,1,0,0.001,509.2,...,,,,313,12,10,54,1377,39363.2,0
4,786,3,199,2,1,0,1,0,0.001,612.1,...,,,,171,1,5,11,786,40044.4,2


In [6]:
# Column dtypes and non-null counts; several float columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 994 entries, 0 to 993
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Feature 1 (Discrete)        994 non-null    int64  
 1   Feature 2 (Discrete)        994 non-null    int64  
 2   Feature 3 (Discrete)        994 non-null    int64  
 3   Feature 4 (Discrete)        994 non-null    int64  
 4   Feature 5 (Discrete)        994 non-null    int64  
 5   Feature 6 (Discrete)        994 non-null    int64  
 6   Feature 7 (Discrete)        994 non-null    int64  
 7   Feature 8 (Discrete)        994 non-null    int64  
 8   Feature 9                   980 non-null    float64
 9   Feature 10                  993 non-null    float64
 10  Feature 11                  993 non-null    float64
 11  Feature 12                  993 non-null    float64
 12  Feature 13                  993 non-null    float64
 13  Feature 14                  993 non

In [7]:
# Inverse-frequency weight per class: total number of rows divided by the
# number of rows carrying that label, so rarer classes get larger weights.
class_label_freq_dict = df['Target Variable (Discrete)'].value_counts().to_dict()
total_samples = len(df)
class_weights_dict = {
    label: total_samples / count
    for label, count in class_label_freq_dict.items()
}

print(class_weights_dict)

{1: 2.0368852459016393, 0: 3.9919678714859437, 2: 9.119266055045872, 6: 14.2, 5: 24.24390243902439, 8: 142.0, 14: 198.8, 7: 198.8, 15: 248.5, 4: 331.3333333333333, 13: 331.3333333333333, 3: 331.3333333333333, 9: 497.0, 12: 994.0, 17: 994.0, 11: 994.0, 10: 994.0, 16: 994.0}


In [8]:
# Rows per class label; the displayed counts show a heavily imbalanced target.
df['Target Variable (Discrete)'].value_counts()

Target Variable (Discrete)
1     488
0     249
2     109
6      70
5      41
8       7
14      5
7       5
15      4
4       3
13      3
3       3
9       2
12      1
17      1
11      1
10      1
16      1
Name: count, dtype: int64

In [9]:
def data_preprocessor(X):
    """Build an unfitted ColumnTransformer for the columns of ``X``.

    Columns are routed purely by dtype. ``int64`` columns are treated as
    categorical: most-frequent imputation followed by one-hot encoding that
    ignores categories unseen during fit. ``float64`` columns are treated as
    numerical: mean imputation followed by standardisation. Columns of any
    other dtype are not listed in either branch.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose column names and dtypes decide the routing. Only the
        schema is inspected; nothing is fitted here.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        Transformer with a ``'num'`` branch followed by a ``'cat'`` branch.
    """
    int_columns = []
    float_columns = []
    for column in X.columns:
        column_dtype = X[column].dtype
        if column_dtype == "int64":
            int_columns.append(column)
        if column_dtype == "float64":
            float_columns.append(column)

    # Numerical branch: fill gaps with the column mean, then standardise.
    scale_numeric = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])

    # Categorical branch: fill gaps with the mode, then one-hot encode.
    encode_categorical = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    return ColumnTransformer(transformers=[
        ('num', scale_numeric, float_columns),
        ('cat', encode_categorical, int_columns),
    ])

def add_interaction_features(df):
    """Return a copy of ``df`` extended with six pairwise interaction columns.

    Four columns are products and two are ratios of existing feature columns.
    The input frame is left unmodified, so calling this function repeatedly on
    the same frame always yields the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Feature 5 (Discrete)', 'Feature 8 (Discrete)' and
        'Feature 9', 10, 11, 12, 13, 15, 16, 17, 18 and 24.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by
        'Feature9_Feature12', 'Feature5_Feature18', 'Feature13_Feature15',
        'Feature8_Feature24', 'Feature10_Feature11' and 'Feature16_Feature17'.

    Raises
    ------
    KeyError
        If any of the required source columns is missing.
    """
    def _safe_ratio(numerator, denominator):
        # A zero denominator yields +/-inf, which the imputer and scaler used
        # downstream reject; mark those cells as missing so they get imputed.
        return (numerator / denominator).replace([np.inf, -np.inf], np.nan)

    # Work on a copy so the caller's frame is never mutated in place.
    out = df.copy()
    out['Feature9_Feature12'] = out['Feature 9'] * out['Feature 12']
    out['Feature5_Feature18'] = out['Feature 5 (Discrete)'] * out['Feature 18']
    out['Feature13_Feature15'] = _safe_ratio(out['Feature 13'], out['Feature 15'])
    out['Feature8_Feature24'] = out['Feature 8 (Discrete)'] * out['Feature 24']
    out['Feature10_Feature11'] = _safe_ratio(out['Feature 10'], out['Feature 11'])
    out['Feature16_Feature17'] = out['Feature 16'] * out['Feature 17']
    return out


In [10]:
# Add the same engineered interaction columns to the training frame and to the
# prediction frame so both expose the same feature columns to the preprocessor.
df = add_interaction_features(df)
df_valid = add_interaction_features(df_valid)

In [11]:
# Separate features and target variable
X = df.drop("Target Variable (Discrete)", axis=1)
y = df["Target Variable (Discrete)"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=random_state)

ros = RandomOverSampler(sampling_strategy='auto', random_state=random_state)
print(f"Shape before upsampling: {X_train.shape} : {y_train.shape}")
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
print(f"Shape after upsampling: {X_train_resampled.shape} : {y_train_resampled.shape}")

preprocessor = data_preprocessor(X)
X_train_preprocessed = preprocessor.fit_transform(X_train_resampled)
X_test_preprocessed = preprocessor.transform(X_test)
X_valid_preprocessed = preprocessor.transform(df_valid)


# preprocessor = data_preprocessor(X)
# X_preprocessed = preprocessor.fit_transform(X)
# X_valid_preprocessed = preprocessor.transform(df_valid)

# X_train_preprocessed, X_test_preprocessed, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.1, random_state=random_state)

# poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# # Fit and transform on training data
# X_poly = poly.fit_transform(X_preprocessed)

# # Transform validation data
# X_valid_poly = poly.transform(X_valid_preprocessed)

# # Convert PolynomialFeatures output to sparse matrix and concatenate with original data
# X_poly_sparse = hstack([X_preprocessed, X_poly[:, X_preprocessed.shape[1]:]])
# X_valid_preprocessed = hstack([X_valid_preprocessed, X_valid_poly[:, X_valid_preprocessed.shape[1]:]])

# # Split the data into training and testing sets
# X_train_preprocessed, X_test_preprocessed, y_train, y_test = train_test_split(X_poly_sparse, y, test_size=0.1, random_state=random_state)

Shape before upsampling: (894, 30) : (894,)
Shape after upsampling: (7497, 30) : (7497,)


In [12]:
# Shapes of the three preprocessed matrices; their column counts should match.
X_train_preprocessed.shape, X_test_preprocessed.shape, X_valid_preprocessed.shape

((7497, 2674), (100, 2674), (426, 2674))

In [13]:
def evaluate_models(X_train, X_test, y_train, y_test):
    """Fit a fixed set of classifiers and score each one on held-out data.

    Five models are trained on ``(X_train, y_train)``: SGD, random forest,
    decision tree, AdaBoost and an RBF-kernel SVM. Each is scored on
    ``(X_test, y_test)`` with accuracy and macro-averaged precision, recall
    and F1, and the scores are printed as they are computed.

    The function reads the module-level names ``random_state`` and
    ``class_weights_dict``.

    Parameters
    ----------
    X_train, X_test : array-like or sparse matrix
        Already preprocessed feature matrices with the same number of columns.
    y_train, y_test : array-like
        Class labels aligned with ``X_train`` and ``X_test``.

    Returns
    -------
    dict
        Maps each model name to ``{'model': fitted single-step Pipeline,
        'metrics': {'Accuracy', 'Precision', 'Recall', 'F1 Score'}}``.
    """
    # NOTE(review): the random forest is weighted with class_weights_dict, and
    # the caller in this notebook passes over-sampled training data. Confirm
    # that applying both corrections together is intended.
    models = {
        'Stochastic Gradient Descent' : SGDClassifier(random_state=random_state),
        'Random Forest': RandomForestClassifier(n_estimators=1000, random_state=random_state, class_weight=class_weights_dict),
        'Decsision Tree': DecisionTreeClassifier(random_state=random_state),
        'AdaBoost': AdaBoostClassifier(random_state=random_state),
        'Support Vector Machine': SVC(C=1.0, kernel='rbf', gamma='scale', probability=True, random_state=random_state)
    }

    results = {}

    for model_name, model in models.items():
        # The estimator stays wrapped in a one-step pipeline so the stored
        # 'model' object has the same type callers already rely on.
        model_pipeline = Pipeline(steps=[('model', model)])

        # Fit the pipeline on the training data
        model_pipeline.fit(X_train, y_train)

        # Make predictions on the test set
        y_pred = model_pipeline.predict(X_test)

        # With macro averaging, a class that is never predicted (or never
        # present in y_test) has an undefined score. zero_division=0 makes the
        # existing fallback value explicit, which keeps the numbers identical
        # and avoids an UndefinedMetricWarning per affected metric.
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
        recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

        # Store the model and results in a dictionary
        results[model_name] = {
            'model': model_pipeline,
            'metrics': {
                'Accuracy': accuracy,
                'Precision': precision,
                'Recall': recall,
                'F1 Score': f1
            }
        }

        # Display results
        print(f"\nResults for {model_name}:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")

    return results

In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

def evaluate_models_cross(X_train, y_train):
    """Fit a fixed set of classifiers and report training and CV scores.

    Each model is fitted on the full ``(X_train, y_train)`` and scored in two
    ways, with the scores printed as they are computed:

    * Training-set metrics: accuracy and macro-averaged precision, recall and
      F1 on predictions for ``X_train`` itself. These are not held-out scores.
    * Cross-validated accuracy: mean and standard deviation over a shuffled
      5-fold ``StratifiedKFold``.

    The function reads the module-level name ``random_state``.

    Parameters
    ----------
    X_train : array-like or sparse matrix
        Already preprocessed feature matrix.
    y_train : array-like
        Class labels aligned with ``X_train``.

    Returns
    -------
    dict
        Maps each model name to ``{'model': fitted estimator, 'metrics':
        {'Accuracy', 'Precision', 'Recall', 'F1 Score', 'CV Mean Accuracy',
        'CV Std Accuracy'}}``.
    """
    models = {
        # 'log_loss' is the current name of the logistic loss; the former
        # spelling 'log' is deprecated and rejected by recent scikit-learn.
        'Stochastic Gradient Descent' : SGDClassifier(random_state=random_state, loss='log_loss'),
        'Random Forest': RandomForestClassifier(n_estimators=1000, random_state=random_state),
        'Decsision Tree': DecisionTreeClassifier(random_state=random_state),
        'AdaBoost': AdaBoostClassifier(random_state=random_state),
        'Support Vector Machine': SVC(C=1.0, kernel='rbf', gamma='scale', probability=True, random_state=random_state)
    }

    results = {}

    for model_name, model in models.items():

        # Fit on all supplied rows; this fitted instance is what gets returned.
        model.fit(X_train, y_train)

        # Perform cross-validation
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)  # Adjust the number of splits as needed
        cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')

        # Predict on the training rows themselves: the four metrics below are
        # training-set scores, not held-out estimates.
        y_pred_train = model.predict(X_train)

        # zero_division=0 makes the fallback for undefined per-class scores
        # explicit, keeping the values identical and avoiding warnings.
        accuracy = accuracy_score(y_train, y_pred_train)
        precision = precision_score(y_train, y_pred_train, average='macro', zero_division=0)
        recall = recall_score(y_train, y_pred_train, average='macro', zero_division=0)
        f1 = f1_score(y_train, y_pred_train, average='macro', zero_division=0)

        # Store the model and results in a dictionary
        results[model_name] = {
            'model': model,
            'metrics': {
                'Accuracy': accuracy,
                'Precision': precision,
                'Recall': recall,
                'F1 Score': f1,
                'CV Mean Accuracy': cv_scores.mean(),  # Mean cross-validation accuracy
                'CV Std Accuracy': cv_scores.std()  # Standard deviation of cross-validation accuracy
            }
        }

        # Display results
        print(f"\nResults for {model_name}:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print(f"CV Mean Accuracy: {cv_scores.mean():.4f}")
        print(f"CV Std Accuracy: {cv_scores.std():.4f}")

    return results

In [15]:
# Assuming 'your_data' is the variable containing your DataFrame
preprocessor = data_preprocessor(X)
results = evaluate_models(X_train_preprocessed, X_test_preprocessed, y_train_resampled, y_test)
#results = evaluate_models_cross(X_preprocessed, y)


Results for Stochastic Gradient Descent:
Accuracy: 0.8500
Precision: 0.3966
Recall: 0.4033
F1 Score: 0.3983


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Results for Random Forest:
Accuracy: 0.8800
Precision: 0.4725
Recall: 0.4924
F1 Score: 0.4795

Results for Decsision Tree:
Accuracy: 0.8000
Precision: 0.3517
Recall: 0.3624
F1 Score: 0.3564


  _warn_prf(average, modifier, msg_start, len(result))



Results for AdaBoost:
Accuracy: 0.1600
Precision: 0.0145
Recall: 0.0909
F1 Score: 0.0251

Results for Support Vector Machine:
Accuracy: 0.8800
Precision: 0.4779
Recall: 0.5094
F1 Score: 0.4918


  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Predict labels for the prediction frame with the fitted SVM pipeline
# (it has the highest macro F1 among the results printed above).
y_pred_val_final = results['Support Vector Machine']['model'].predict(X_valid_preprocessed)

In [17]:
# Store the predictions as a new 'Category' column on the prediction frame.
df_valid['Category'] = y_pred_val_final

In [18]:
# Submission ids are the index labels of df_valid shifted up by one.
index_list = [label + 1 for label in df_valid.index.tolist()]

# Two-column submission frame: 'Id' first, then the predicted 'Category'.
data_submission_final = pd.DataFrame({'Id': index_list})
data_submission_final['Category'] = df_valid['Category']

In [19]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
data_submission_final.to_csv("test_output.csv",index=False)