# CS253 PYTHON ASSIGNMENT
This notebook experiments with the CTGAN library, which is used to generate synthetic data modelled on the input data.

## IMPORTING LIBRARIES AND CREATION OF DATAFRAME

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import sklearn.preprocessing
from sklearn.model_selection import train_test_split

In [None]:
# Read '../train.csv' (path relative to the notebook's working directory) into `df`.
df=pd.read_csv('../train.csv')

In [None]:
def convert_to_numeric(value):
    """Convert an amount such as '2 Crore+' into a plain float.

    Strings containing 'Crore', 'Lac', 'Thou' or 'Hund' have the matching
    ' <unit>+' suffix stripped and the remaining number multiplied by
    10000000, 100000, 1000 or 100 respectively. Strings without any of
    these units, and non-string values, are passed straight to ``float``.

    Raises whatever ``float`` raises when the remaining text is not numeric.
    """
    if not isinstance(value, str):
        return float(value)

    # Units are tested in this fixed order; the first one found in the text wins.
    unit_multipliers = (
        ('Crore', 10000000),
        ('Lac', 100000),
        ('Thou', 1000),
        ('Hund', 100),
    )
    for unit, multiplier in unit_multipliers:
        if unit in value:
            return float(value.replace(' ' + unit + '+', '')) * multiplier

    return float(value)

In [None]:
# Convert the two money columns to floats with convert_to_numeric.
for money_column in ('Total Assets', 'Liabilities'):
    df[money_column] = df[money_column].apply(convert_to_numeric)

# One LabelEncoder per categorical column; each encoder is kept under its own
# name so the integer codes can be mapped back to the original labels.
le = sklearn.preprocessing.LabelEncoder()
le_1 = sklearn.preprocessing.LabelEncoder()
le_2 = sklearn.preprocessing.LabelEncoder()
for encoder, categorical_column in ((le, 'state'), (le_1, 'Party'), (le_2, 'Education')):
    df[categorical_column] = encoder.fit_transform(df[categorical_column])

# These two columns are removed before modelling.
df = df.drop(columns=['Candidate', 'Constituency ∇'])

In [None]:
# Show column dtypes and non-null counts after the preprocessing above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2059 entries, 0 to 2058
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             2059 non-null   int64  
 1   Party          2059 non-null   int64  
 2   Criminal Case  2059 non-null   int64  
 3   Total Assets   2059 non-null   float64
 4   Liabilities    2059 non-null   float64
 5   state          2059 non-null   int64  
 6   Education      2059 non-null   int64  
dtypes: float64(2), int64(5)
memory usage: 112.7 KB


In [None]:
# Preview the first rows of the encoded frame.
df.head()

Unnamed: 0,ID,Party,Criminal Case,Total Assets,Liabilities,state,Education
0,0,7,4,2110000000.0,20000000.0,23,3
1,1,4,0,10000000.0,0.0,13,1
2,2,8,0,70000000.0,2200000.0,11,9
3,3,4,0,90000000.0,2400000.0,3,9
4,4,4,2,20000000.0,6100000.0,27,3


## CREATING SYNTHETIC DATA USING CTGAN

In [None]:
# Target is the encoded Education column; features are everything except it and the row ID.
y = df['Education']
X = df.drop(columns=['ID', 'Education'])

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from ctgan import CTGAN

In [None]:
# Every column of the training frame is declared discrete, including the two monetary ones.
# NOTE(review): confirm that treating 'Total Assets' and 'Liabilities' as discrete is intended.
discrete_columns = [
    'state',
    'Party',
    'Criminal Case',
    'Total Assets',
    'Liabilities',
    'Education',
]

# Put the training features and labels back side by side so CTGAN is fitted on both.
df_final = pd.concat([X_train, y_train], axis=1)

ctgan = CTGAN(epochs=20000, log_frequency=True, verbose=True)
ctgan.fit(df_final, discrete_columns=discrete_columns)

Gen. (-0.42) | Discrim. (0.01): 100%|██████████| 20000/20000 [1:28:20<00:00,  3.77it/s]   


In [None]:
# Number of synthetic rows to draw from the fitted CTGAN model.
num_samples = 5000
synthetic_data = ctgan.sample(num_samples)

# Separate the generated Education label from the generated feature columns.
synthetic_labels = synthetic_data['Education']
synthetic_features = synthetic_data.drop(columns=['Education'])

In [None]:
# Recombine generated features and labels column-wise and save them without the index.
synthetic_data_df = pd.concat(
    [synthetic_features, synthetic_labels],
    axis=1,
)
synthetic_data_df.to_csv('synthetic_data.csv', index=False)

In [None]:
# Stack the real training split on top of the CTGAN samples (row-wise).
# ignore_index=True gives both results the same fresh 0..n-1 index. Without it the
# combined frame keeps the original labels of both inputs, and the synthetic rows
# (presumably indexed from 0 by ctgan.sample - verify) would repeat labels already
# used by the real rows, making label-based lookups and alignment ambiguous.
# Features and labels are concatenated in the same order, so they stay row-aligned.
final_training_data = pd.concat([X_train, synthetic_features], axis=0, ignore_index=True)
final_training_labels = pd.concat([y_train, synthetic_labels], axis=0, ignore_index=True)

Thus, the final training is done on a dataset consisting of the initial training split plus the synthetic data.

## TRAINING

### MODELS TO BE TRAINED:
- KNN
- Decision Trees

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

### EVALUATION METRICS:
- Accuracy
- Precision
- Recall
- F1 Score

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

### DECISION TREE CLASSIFIER

In [None]:
# Hyper-parameter grid for the decision tree.
# max_leaf_nodes must be None or an integer >= 2. A value of 1 is rejected by
# scikit-learn's parameter validation, so every fit using it fails and is scored
# as NaN; it is therefore left out of the grid.
param_grid = {
    'max_depth': range(1, 20),
    'max_leaf_nodes': [None, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
}
dt_model = DecisionTreeClassifier()
# Exhaustive search with 10-fold cross-validation, ranked by weighted F1.
dt_model_cv = GridSearchCV(dt_model, param_grid=param_grid, cv=10, scoring='f1_weighted')
dt_model_cv.fit(final_training_data, final_training_labels)

17100 fits failed out of a total of 188100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
17100 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/aditikhandelia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/aditikhandelia/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/Users/aditikhandelia/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/aditikhandelia/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_val

In [None]:
# Report what the grid search selected and its cross-validated score.
print('best parameters:', dt_model_cv.best_params_)
print('best score:', dt_model_cv.best_score_)

# Predict on the held-out test split and print each metric in turn.
y_pred = dt_model_cv.predict(X_test)
weighted = {'average': 'weighted'}
for metric_label, metric_fn, metric_options in (
    ('f1 score:', f1_score, weighted),
    ('accuracy:', accuracy_score, {}),
    ('precision:', precision_score, weighted),
    ('recall:', recall_score, weighted),
):
    print(metric_label, metric_fn(y_test, y_pred, **metric_options))

best parameters: {'max_depth': 9, 'max_leaf_nodes': None, 'min_samples_leaf': 3, 'min_samples_split': 8}
best score: 0.2638310182819531
f1 score: 0.21831849139361328
accuracy: 0.24029126213592233
precision: 0.22196272811704104
recall: 0.24029126213592233


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### KNN

In [None]:
# Search k = 1..49 for KNN with 10-fold cross-validation, ranked by weighted F1.
# NOTE(review): the features are passed in unscaled; confirm that is intended for a
# distance-based model given the magnitude of the monetary columns.
knn_model = KNeighborsClassifier()
param_grid = {'n_neighbors': range(1, 50)}
knn_model_cv = GridSearchCV(
    knn_model,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=10,
)
knn_model_cv.fit(final_training_data, final_training_labels)

In [None]:
# Report what the grid search selected and its cross-validated score.
print('best parameters:', knn_model_cv.best_params_)
print('best score:', knn_model_cv.best_score_)

# Predict on the held-out test split and print each metric in turn.
y_pred = knn_model_cv.predict(X_test)
weighted = {'average': 'weighted'}
for metric_label, metric_fn, metric_options in (
    ('f1 score:', f1_score, weighted),
    ('accuracy:', accuracy_score, {}),
    ('precision:', precision_score, weighted),
    ('recall:', recall_score, weighted),
):
    print(metric_label, metric_fn(y_test, y_pred, **metric_options))

best parameters: {'n_neighbors': 8}
best score: 0.3296532345368187
f1 score: 0.15380032862750484
accuracy: 0.15776699029126215
precision: 0.16218915327094113
recall: 0.15776699029126215


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### TRAINING DIFFERENT MODELS FOR DIFFERENT STATES

Note that the features that correlate well with the target differ from state to state. Training a separate model for each state might therefore help capture these differences.

In [None]:
# make separate datasets for separate states
unique_states = final_training_data['state'].unique()
state_df=[]
state_df_labels=[]
state_df_test=[]
state_df_test_labels=[]
for i in unique_states:
    state_df.append(final_training_data[final_training_data['state']==i])
    state_df_labels.append(final_training_labels[final_training_data['state']==i])
for i in unique_states:
    state_df_test.append(X_test[X_test['state']==i])
    state_df_test_labels.append(y_test[X_test['state']==i])

In [None]:
# Fit one KNN model per state and report its cross-validated and test-set weighted F1.
# models[k] is the fitted search object for unique_states[k].
models = []
for i in range(len(state_df)):
    # The grid holds a single candidate (n_neighbors=1); GridSearchCV is used here
    # for its 5-fold cross-validated score.
    param_grid = {'n_neighbors': range(1, 2)}
    knn_model_cv = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=5, scoring='f1_weighted')
    knn_model_cv.fit(state_df[i], state_df_labels[i])
    models.append(knn_model_cv)
    # `le` is the encoder fitted on the state column, so this recovers the state name.
    print('state:', le.inverse_transform([unique_states[i]]))
    print('best parameters:', knn_model_cv.best_params_)
    print('best score:', knn_model_cv.best_score_)
    if len(state_df_test[i]) == 0:
        # A state can be present in the training data yet have no rows in the
        # test split; predicting on an empty frame raises, so report and move on.
        print('f1 score: n/a (no test rows for this state)')
    else:
        y_pred = knn_model_cv.predict(state_df_test[i])
        print('f1 score:', f1_score(state_df_test_labels[i], y_pred, average='weighted'))
    print('-------------------------')



state: ['UTTAR PRADESH']
best parameters: {'n_neighbors': 1}
best score: 0.24937205266044277
f1 score: 0.19054054054054054
-------------------------
state: ['CHHATTISGARH']
best parameters: {'n_neighbors': 1}
best score: 0.2849722926646004
f1 score: 0.4242424242424242
-------------------------
state: ['RAJASTHAN']
best parameters: {'n_neighbors': 1}
best score: 0.18425322128851543
f1 score: 0.35037878787878785
-------------------------
state: ['NAGALAND']
best parameters: {'n_neighbors': 1}
best score: 0.49004535147392286
f1 score: 0.3333333333333333
-------------------------
state: ['MADHYA PRADESH']
best parameters: {'n_neighbors': 1}
best score: 0.16635532119972957
f1 score: 0.29743589743589743
-------------------------
state: ['ARUNACHAL PRADESH']
best parameters: {'n_neighbors': 1}
best score: 0.2052910052910053
f1 score: 0.08333333333333333
-------------------------
state: ['HIMACHAL PRADESH']
best parameters: {'n_neighbors': 1}
best score: 0.0874074074074074
f1 score: 0.0
------



state: ['TAMIL NADU']
best parameters: {'n_neighbors': 1}
best score: 0.28516426799321537
f1 score: 0.08947368421052633
-------------------------
state: ['KERALA']
best parameters: {'n_neighbors': 1}
best score: 0.2843589743589744
f1 score: 0.13333333333333333
-------------------------
state: ['KARNATAKA']
best parameters: {'n_neighbors': 1}
best score: 0.24039979353894947
f1 score: 0.2253787878787879
-------------------------
state: ['GUJARAT']
best parameters: {'n_neighbors': 1}
best score: 0.14869854782898262
f1 score: 0.14603174603174604
-------------------------
state: ['MANIPUR']
best parameters: {'n_neighbors': 1}
best score: 0.2656746031746032
f1 score: 0.07142857142857142
-------------------------
state: ['BIHAR']
best parameters: {'n_neighbors': 1}
best score: 0.20586009837912952
f1 score: 0.21001221001221002
-------------------------
state: ['DELHI']
best parameters: {'n_neighbors': 1}
best score: 0.29003896103896104
f1 score: 0.16
-------------------------
state: ['PUNJAB']



Some states have an F1 score of 0, and very few score higher than our baseline F1 score of 0.24.