## FINAL MACHINING PROJECT

In [3]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [4]:
df=pd.read_csv("MAlayalam_char_glcm_features.csv")
df.drop("fname",axis=1,inplace=True)
df.head(3)

Unnamed: 0,dissimilarity_0,dissimilarity_45,dissimilarity_90,dissimilarity_135,correlation_0,correlation_45,correlation_90,correlation_135,homogeneity_0,homogeneity_45,...,contrast_135,ASM_0,ASM_45,ASM_90,ASM_135,energy_0,energy_45,energy_90,energy_135,label
0,17.216435,21.119725,20.889887,21.54333,0.491436,0.390284,0.389736,0.378055,0.932486,0.917179,...,5493.549025,0.804286,0.788199,0.790551,0.786816,0.89682,0.887806,0.889129,0.887027,3333
1,12.143708,15.748918,14.495192,15.859307,0.504656,0.375442,0.41403,0.371065,0.952378,0.93824,...,4044.123377,0.858505,0.843167,0.849379,0.842788,0.926556,0.918241,0.921618,0.918035,3333
2,17.216435,21.119725,20.889887,21.54333,0.491436,0.390284,0.389736,0.378055,0.932486,0.917179,...,5493.549025,0.804286,0.788199,0.790551,0.786816,0.89682,0.887806,0.889129,0.887027,3333


### Understanding about the dataframe

In [5]:
def information(df):
    # Finding the numbers of columns

    num_columns = len(df.columns)
    print("Number of columns:", num_columns)

    # Shape of data frame
    print(f"The shape of the data frame is : {df.shape}")

    # The number of Unique values in label
    unique_counts = df["label"].value_counts()
    print(f"Unique value counts in 'Label' :  {len(unique_counts)}")
    

information(df)


Number of columns: 25
The shape of the data frame is : (3287, 25)
Unique value counts in 'Label' :  48


In [6]:
# import all the modules
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.model_selection import cross_val_score


### Preparing the data

In [7]:
x = df.drop('label', axis=1)# Features
y = df['label']  # Target  

# Splitting the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

### Evaluation metrics for checking the accuracy

In [8]:
def evaluate_preds(y_true,y_preds):
    accuracy = accuracy_score(y_true,y_preds)
    precision = precision_score(y_true,y_preds,average='macro')
    recall = recall_score(y_true,y_preds,average='macro')
    f1 = f1_score(y_true,y_preds,average='macro')
    metric_dict = { "accuracy"   : round(accuracy,2),
                    "precision"  : round(precision,2),
                    "recall"     : round(recall,2),
                    "f1 "        : round(f1,2)
                  }
                   
    print(f"Acc        : {accuracy*100:.2f}%")
    print(f"Precision  : {precision*100:.2f}%")
    print(f"Recall     : {recall*100:.2f}%")
    print(f"F1_score   : {f1*100:.2f}%")
    return metric_dict

In [9]:
def cross_fun(clf,x,y):
    cv_score=cross_val_score(clf,x,y,cv=5)
    print(f"The cross validation accuracy MEAN is : {np.mean(cv_score)*100:.2f}%")
    print(f"The cross validation accuracy STD  is : {np.std(cv_score)*100:.2f}")
    

#### KNN model

In [10]:
knn = KNeighborsClassifier(n_neighbors=5)

knn.fit(x_train, y_train)

y_test_preds = knn.predict(x_test.values)
y_train_preds=knn.predict(x_train.values)



print(f"Report for train data")
evaluate_preds(y_train,y_train_preds)
print("  ")
print("  ")
print("  ")
print(f"Report for test data")
evaluate_preds(y_test,y_test_preds)

print(" ")
print(" ")
print(f"N FOLD SCORE")
cross_fun(knn,x,y)

Report for train data
Acc        : 60.14%
Precision  : 59.41%
Recall     : 56.65%
F1_score   : 55.20%
  
  
  
Report for test data
Acc        : 47.72%
Precision  : 33.88%
Recall     : 38.02%
F1_score   : 34.33%
 
 
N FOLD SCORE
The cross validation accuracy MEAN is : nan%
The cross validation accuracy STD  is : nan


### Random Forest

In [11]:
model = RandomForestClassifier()

model.fit(x_train, y_train)

y_test_preds = model.predict(x_test.values)
y_train_preds= model.predict(x_train.values)



print(f"Report for train data")
evaluate_preds(y_train,y_train_preds)
print("  ")
print("  ")
print("  ")
print(f"Report for test data")
evaluate_preds(y_test,y_test_preds)

print(" ")
print(" ")
print(f"N FOLD SCORE")
cross_fun(model,x,y)

Report for train data
Acc        : 82.08%
Precision  : 87.91%
Recall     : 89.58%
F1_score   : 88.18%
  
  
  
Report for test data
Acc        : 61.85%
Precision  : 62.61%
Recall     : 61.56%
F1_score   : 58.88%
 
 
N FOLD SCORE
The cross validation accuracy MEAN is : 59.17%
The cross validation accuracy STD  is : 2.15


###  DecisionTreeClassifier

In [96]:
des_trees = DecisionTreeClassifier(random_state=42)

des_trees.fit(x_train, y_train)

y_test_preds = des_trees.predict(x_test.values)
y_train_preds= des_trees.predict(x_train.values)



print(f"Report for train data")
evaluate_preds(y_train,y_train_preds)
print("  ")
print("  ")
print("  ")
print(f"Report for test data")
evaluate_preds(y_test,y_test_preds)

print(" ")
print(" ")
print(f"N FOLD SCORE")
cross_fun(des_trees,x,y)

Report for train data
Acc        : 82.08%
Precision  : 87.91%
Recall     : 89.58%
F1_score   : 88.18%
  
  
  
Report for test data
Acc        : 58.36%
Precision  : 55.42%
Recall     : 56.36%
F1_score   : 53.88%
 
 
N FOLD SCORE
The cross validation accuracy MEAN is : 55.43%
The cross validation accuracy STD  is : 2.04%


### MLP

In [83]:
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, alpha=0.0001,
                    solver='adam', verbose=False, random_state=42, tol=0.0001)

mlp.fit(x_train, y_train)

y_test_preds = mlp.predict(x_test.values)
y_train_preds= mlp.predict(x_train.values)



print(f"Report for train data")
evaluate_preds(y_train,y_train_preds)
print("  ")
print("  ")
print("  ")
print(f"Report for test data")
evaluate_preds(y_test,y_test_preds)

print(" ")
print(" ")
print(f"N FOLD SCORE")
cross_fun(mlp,x,y)


Report for train data
Acc        : 16.81%
Precision  : 8.99%
Recall     : 12.62%
F1_score   : 7.23%
  
  
  
Report for test data
Acc        : 14.44%
Precision  : 8.91%
Recall     : 10.56%
F1_score   : 5.76%
 
 
N FOLD SCORE
The cross validation accuracy MEAN is : 8.94%
The cross validation accuracy STD  is : 0.22%


### SVM model

In [89]:
svm = SVC(kernel='rbf', random_state=42)

svm.fit(x_train, y_train)

y_test_preds = svm.predict(x_test.values)
y_train_preds= svm.predict(x_train.values)



print(f"Report for train data")
evaluate_preds(y_train,y_train_preds)
print("  ")
print("  ")
print("  ")
print(f"Report for test data")
evaluate_preds(y_test,y_test_preds)

print(" ")
print(" ")
print(f"N FOLD SCORE")
cross_fun(svm,x,y)

Report for train data
Acc        : 28.19%
Precision  : 11.96%
Recall     : 14.89%
F1_score   : 11.00%
  
  
  
Report for test data
Acc        : 24.92%
Precision  : 13.42%
Recall     : 14.49%
F1_score   : 10.50%
 
 
N FOLD SCORE
The cross validation accuracy MEAN is : 26.35%
The cross validation accuracy STD  is : 1.43%


### Naive_bayes

In [93]:
nb = MultinomialNB()

nb.fit(x_train, y_train)

y_test_preds =nb.predict(x_test.values)
y_train_preds=nb.predict(x_train.values)



print(f"Report for train data")
evaluate_preds(y_train,y_train_preds)
print("  ")
print("  ")
print("  ")
print(f"Report for test data")
evaluate_preds(y_test,y_test_preds)

print(" ")
print(" ")
print(f"N FOLD SCORE")
cross_fun(nb,x,y)

Report for train data
Acc        : 19.86%
Precision  : 14.52%
Recall     : 19.08%
F1_score   : 13.77%
  
  
  
Report for test data
Acc        : 20.36%
Precision  : 18.07%
Recall     : 20.70%
F1_score   : 15.89%
 
 
N FOLD SCORE
The cross validation accuracy MEAN is : 18.31%
The cross validation accuracy STD  is : 0.95%


### LogisticRegression

In [94]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs')

log_reg.fit(x_train, y_train)

y_test_preds =log_reg.predict(x_test.values)
y_train_preds=log_reg.predict(x_train.values)


print(f"Report for train data")
evaluate_preds(y_train,y_train_preds)
print("  ")
print("  ")
print("  ")
print(f"Report for test data")
evaluate_preds(y_test,y_test_preds)

print(" ")
print(" ")
print(f"N FOLD SCORE")
cross_fun(log_reg,x,y)


Report for train data
Acc        : 19.32%
Precision  : 8.12%
Recall     : 11.85%
F1_score   : 8.67%
  
  
  
Report for test data
Acc        : 17.02%
Precision  : 8.22%
Recall     : 11.79%
F1_score   : 8.54%
 
 
N FOLD SCORE
The cross validation accuracy MEAN is : 16.52%
The cross validation accuracy STD  is : 1.42%
