## FINAL MACHINING PROJECT

In [2]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
df=pd.read_csv("MAlayalam_char_glcm_features.csv")
df.drop("fname",axis=1,inplace=True)
df.head(3)

Unnamed: 0,dissimilarity_0,dissimilarity_45,dissimilarity_90,dissimilarity_135,correlation_0,correlation_45,correlation_90,correlation_135,homogeneity_0,homogeneity_45,...,contrast_135,ASM_0,ASM_45,ASM_90,ASM_135,energy_0,energy_45,energy_90,energy_135,label
0,17.216435,21.119725,20.889887,21.54333,0.491436,0.390284,0.389736,0.378055,0.932486,0.917179,...,5493.549025,0.804286,0.788199,0.790551,0.786816,0.89682,0.887806,0.889129,0.887027,3333
1,12.143708,15.748918,14.495192,15.859307,0.504656,0.375442,0.41403,0.371065,0.952378,0.93824,...,4044.123377,0.858505,0.843167,0.849379,0.842788,0.926556,0.918241,0.921618,0.918035,3333
2,17.216435,21.119725,20.889887,21.54333,0.491436,0.390284,0.389736,0.378055,0.932486,0.917179,...,5493.549025,0.804286,0.788199,0.790551,0.786816,0.89682,0.887806,0.889129,0.887027,3333


### Understanding about the dataframe

In [4]:
def information(df):
    # Finding the numbers of columns

    num_columns = len(df.columns)
    print("Number of columns:", num_columns)

    # Shape of data frame
    print(f"The shape of the data frame is : {df.shape}")

    # The number of Unique values in label
    unique_counts = df["label"].value_counts()
    print(f"Unique value counts in 'Label' :  {len(unique_counts)}")
    

information(df)


Number of columns: 25
The shape of the data frame is : (3287, 25)
Unique value counts in 'Label' :  48


### Count of each label in dataframe

In [3]:
label_values = list(df['label'].unique())
label_counts = df['label'].value_counts()
print(label_counts)

3362    160
3361    160
3360    160
3359    160
3357    138
3355    135
3358    132
3350    124
3352    116
3363    106
3333     80
3356     80
3354     80
3353     80
3351     80
3349     80
3346     80
3343     80
3342     80
3337     80
3335     80
3334     80
3377     44
3376     43
3381     43
3366     43
3375     42
3367     42
3365     42
3378     42
3373     41
3374     41
3368     40
3370     40
3364     39
3372     37
3380     36
3379     36
3371     36
3382     30
3451     30
3454     30
3383     29
3384     29
3452     29
3453     28
3385     24
3450     20
Name: label, dtype: int64


### Evaluation metrics for checking the accuracy

In [19]:
def evaluate_preds(y_true,y_preds):
    accuracy = accuracy_score(y_true,y_preds)
    precision = precision_score(y_true,y_preds,average='macro')
    recall = recall_score(y_true,y_preds,average='macro')
    f1 = f1_score(y_true,y_preds,average='macro')
    metric_dict = { "accuracy"   : round(accuracy,2),
                    "precision"  : round(precision,2),
                    "recall"     : round(recall,2),
                    "f1 "        : round(f1,2)
                  }
                   
    print(f"Acc        : {accuracy*100:.2f}%")
    print(f"Precision  : {precision*100:.2f}%")
    print(f"Recall     : {recall*100:.2f}%")
    print(f"F1_score   : {f1*100:.2f}%")
    return metric_dict

In [20]:
def cross_fun(clf,x,y):
    cv_score=cross_val_score(clf,x,y,cv=5)
    print(f"The cross validation accuracy MEAN is : {np.mean(cv_score)*100:.2f}%")
    print(f"The cross validation accuracy STD  is : {np.std(cv_score)*100:.2f}")
    

In [21]:
# import all the modules
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.model_selection import cross_val_score


### Use the Smote to remove the imbalance data

In [22]:
from imblearn.over_sampling import SMOTE
import pandas as pd

# Separate Features and Target Variable
X = df.drop('label', axis=1)  # Features
y = df['label']  # Target variable

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Create a new DataFrame with resampled data
resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
resampled_df['label'] = y_resampled

# Check the class distribution in the new DataFrame
print(resampled_df['label'].value_counts())

3333    160
3334    160
3368    160
3370    160
3371    160
3372    160
3373    160
3374    160
3375    160
3376    160
3377    160
3378    160
3379    160
3380    160
3381    160
3382    160
3383    160
3384    160
3385    160
3450    160
3451    160
3452    160
3453    160
3367    160
3366    160
3365    160
3353    160
3335    160
3337    160
3342    160
3343    160
3346    160
3349    160
3350    160
3351    160
3352    160
3354    160
3364    160
3355    160
3356    160
3357    160
3358    160
3359    160
3360    160
3361    160
3362    160
3363    160
3454    160
Name: label, dtype: int64


### Feature Selection

In [23]:
from sklearn.feature_selection import RFE
def feature_selection(df,clf):
    x = df.drop(columns=['label'])
    y = df['label']

    num_features_to_select = 5  # You can change this number as needed
    rfe = RFE(estimator=clf, n_features_to_select=num_features_to_select)

    # Fit RFE to the data
    rfe.fit(x, y)

    # Get the selected features
    selected_features = x.columns[rfe.support_]

    # Filter the dataset to keep only the selected features and the target variable
    selected_data = df[selected_features.union(['label'])]
    
    return selected_data

## Performing the Hyper parameter Tuning using the GridSearchCv

In [24]:
forest_params={'n_estimators': [10, 100, 200],
        'max_depth': [None, 5],
        'max_features': ['sqrt'],
        'min_samples_split': [6],
        'min_samples_leaf': [2, 4]}

In [25]:
knn_params = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

In [26]:
decision_tree_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


In [27]:
svm_params = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}


In [28]:
def model_training(df,clf):
    x = df.drop('label', axis=1)# Features
    y = df['label']  # Target  

    # Splitting the data into train and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


    clf.fit(x_train, y_train)

    y_test_preds = clf.predict(x_test.values)
    y_train_preds= clf.predict(x_train.values)



    print(f"Report for train data")
    evaluate_preds(y_train,y_train_preds)
    print("  ")
    print("  ")
    print("  ")
    print(f"Report for test data")
    evaluate_preds(y_test,y_test_preds)

    print(" ")
    print(" ")
    print(f"N FOLD SCORE")
    cross_fun(clf,x,y)

In [35]:
### GridSearchCV dataframe

In [36]:
def grid_training(df,clf,params):
    x = df.drop('label', axis=1)# Features
    y = df['label']  # Target  

    # Splitting the data into train and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
     
    gs=GridSearchCV(estimator=clf,
                         param_grid=params,
                         verbose=0)

    gs.fit(x_train,y_train)

    gs_y_test_preds =gs.predict(x_test.values)
    gs_y_train_preds=gs.predict(x_train.values)


    print(f"Report for train data")
    evaluate_preds(y_train,gs_y_train_preds)
    print("  ")
    print("  ")
    print("  ")
    print(f"Report for test data")
    evaluate_preds(y_test,gs_y_test_preds)

    print(" ")
    print(" ")
    print(f"N FOLD SCORE")
    cross_fun(gs,x,y)

    print(f"The best parameters for above model is : ")
    print(gs.best_params_)


### KNN model

In [29]:
knn=KNeighborsClassifier(n_neighbors=5)
model_training(df,knn)

Report for train data
Acc        : 60.14%
Precision  : 59.41%
Recall     : 56.65%
F1_score   : 55.20%
  
  
  
Report for test data
Acc        : 47.72%
Precision  : 33.88%
Recall     : 38.02%
F1_score   : 34.33%
 
 
N FOLD SCORE
The cross validation accuracy MEAN is : nan%
The cross validation accuracy STD  is : nan


### KNN smote

In [31]:
knn=KNeighborsClassifier(n_neighbors=5)
model_training(resampled_df,knn)

Report for train data
Acc        : 79.28%
Precision  : 77.44%
Recall     : 79.15%
F1_score   : 77.56%
  
  
  
Report for test data
Acc        : 73.24%
Precision  : 72.01%
Recall     : 73.81%
F1_score   : 72.04%
 
 
N FOLD SCORE
The cross validation accuracy MEAN is : nan%
The cross validation accuracy STD  is : nan


### KNN feature selection + smote

In [34]:
knn_df=feature_selection(resampled_df,KNeighborsClassifier(n_neighbors=5))
model_training(knn_df,KNeighborsClassifier(n_neighbors=5))

Report for train data
Acc        : 80.06%
Precision  : 78.16%
Recall     : 79.96%
F1_score   : 78.31%
  
  
  
Report for test data
Acc        : 75.46%
Precision  : 73.68%
Recall     : 75.77%
F1_score   : 73.82%
 
 
N FOLD SCORE
The cross validation accuracy MEAN is : 74.52%
The cross validation accuracy STD  is : 1.61


### KNN+feature selection + smote + GridSerchCV

In [37]:
grid_training(knn_df,KNeighborsClassifier(n_neighbors=5),knn_params)

Report for train data
Acc        : 89.70%
Precision  : 87.85%
Recall     : 89.58%
F1_score   : 88.10%
  
  
  
Report for test data
Acc        : 82.03%
Precision  : 80.53%
Recall     : 82.41%
F1_score   : 80.74%
 
 
N FOLD SCORE
The cross validation accuracy MEAN is : 80.52%
The cross validation accuracy STD  is : 1.46
The best parameters for above model is : 
{'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}


### Random Forest  Model

In [38]:
model = RandomForestClassifier()
model_training(df,model)

Report for train data
Acc        : 82.08%
Precision  : 87.91%
Recall     : 89.58%
F1_score   : 88.18%
  
  
  
Report for test data
Acc        : 62.01%
Precision  : 64.00%
Recall     : 62.14%
F1_score   : 59.66%
 
 
N FOLD SCORE
The cross validation accuracy MEAN is : 58.84%
The cross validation accuracy STD  is : 2.26


### RandomForest smote

In [39]:
model= RandomForestClassifier()
model_training(resampled_df,model)

Report for train data
Acc        : 89.78%
Precision  : 87.86%
Recall     : 89.58%
F1_score   : 88.12%
  
  
  
Report for test data
Acc        : 82.62%
Precision  : 81.76%
Recall     : 83.36%
F1_score   : 81.72%
 
 
N FOLD SCORE
The cross validation accuracy MEAN is : 83.03%
The cross validation accuracy STD  is : 1.57


### Random Forest + smote + feature selection

In [40]:
model_df=feature_selection(resampled_df,model)
model_training(model_df,model)

Report for train data
Acc        : 89.78%
Precision  : 87.86%
Recall     : 89.58%
F1_score   : 88.12%
  
  
  
Report for test data
Acc        : 80.86%
Precision  : 79.97%
Recall     : 81.45%
F1_score   : 79.81%
 
 
N FOLD SCORE
The cross validation accuracy MEAN is : 81.68%
The cross validation accuracy STD  is : 1.33


###  Random Forest + smote + feature selection + GridSeaechCv

In [47]:
grid_training(model_df,model,forest_params)

Report for train data
Acc        : 88.87%
Precision  : 86.97%
Recall     : 88.68%
F1_score   : 87.21%
  
  
  
Report for test data
Acc        : 79.17%
Precision  : 78.20%
Recall     : 79.75%
F1_score   : 78.05%
 
 
N FOLD SCORE
The cross validation accuracy MEAN is : 80.07%
The cross validation accuracy STD  is : 1.97
The best parameters for above model is : 
{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 6, 'n_estimators': 200}


### Decission Trees

In [42]:
des_trees = DecisionTreeClassifier(random_state=42)
model_training(df,des_trees)

Report for train data
Acc        : 82.08%
Precision  : 87.91%
Recall     : 89.58%
F1_score   : 88.18%
  
  
  
Report for test data
Acc        : 58.36%
Precision  : 55.42%
Recall     : 56.36%
F1_score   : 53.88%
 
 
N FOLD SCORE
The cross validation accuracy MEAN is : 55.43%
The cross validation accuracy STD  is : 2.04


### Decission + smote

In [43]:
model_training(resampled_df,des_trees)

Report for train data
Acc        : 89.78%
Precision  : 87.86%
Recall     : 89.58%
F1_score   : 88.12%
  
  
  
Report for test data
Acc        : 76.04%
Precision  : 75.01%
Recall     : 76.65%
F1_score   : 75.03%
 
 
N FOLD SCORE
The cross validation accuracy MEAN is : 75.31%
The cross validation accuracy STD  is : 1.07


### Decission + smote + FeatureSelection 

In [45]:
decission_df=feature_selection(resampled_df,des_trees)
model_training(decission_df,des_trees)

Report for train data
Acc        : 89.78%
Precision  : 87.86%
Recall     : 89.58%
F1_score   : 88.12%
  
  
  
Report for test data
Acc        : 74.67%
Precision  : 73.79%
Recall     : 75.11%
F1_score   : 73.61%
 
 
N FOLD SCORE
The cross validation accuracy MEAN is : 74.73%
The cross validation accuracy STD  is : 1.04


### Decission + smote + FeatureSelection + GridSearchCv

In [46]:
grid_training(decission_df,des_trees,decision_tree_params)

Report for train data
Acc        : 89.78%
Precision  : 87.86%
Recall     : 89.58%
F1_score   : 88.12%
  
  
  
Report for test data
Acc        : 76.69%
Precision  : 75.58%
Recall     : 77.47%
F1_score   : 75.69%
 
 
N FOLD SCORE
The cross validation accuracy MEAN is : 75.34%
The cross validation accuracy STD  is : 1.73
The best parameters for above model is : 
{'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}


### SVM model

In [12]:
svm = SVC(kernel='rbf', random_state=42)
model_training(df,svm)

Report for train data
Acc        : 28.19%
Precision  : 11.96%
Recall     : 14.89%
F1_score   : 11.00%
  
  
  
Report for test data
Acc        : 24.92%
Precision  : 13.42%
Recall     : 14.49%
F1_score   : 10.50%
 
 
N FOLD SCORE
The cross validation accuracy MEAN is : 26.35%
The cross validation accuracy STD  is : 1.43


### SVM model + smote

In [None]:
model_training(re,svm)

### SVM model + smote + feature selection