# Model Testing and Selection
   <p>I am focusing on finding the model that best classifies the data as real or fake. For this, the following ML classification algorithms are used: LogisticRegression, QuadraticDiscriminantAnalysis, SVC, LinearSVC, DecisionTreeClassifier, XGBoost, AdaBoost, Perceptron, KNN, GaussianNB, and BaggingClassifier. For faster computation, the dataset is sampled down to 2000 rows.</p>
    <p>Further grid-search analysis is required to find the parameters that result in the best models to fit for training.</p>
    
#### Initial loading and cleaning of data

In [None]:
# importing the libraries
import pandas as pd
import numpy as np
import nltk
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [None]:
# Load the data from the downloaded CSV file at a user-supplied path.
file_path = input("Enter the path of the data file")
data = pd.read_csv(file_path)
# 'Unnamed: 0' is presumably a leftover index column from an earlier export
# (TODO confirm). errors='ignore' keeps the load working when the CSV does
# not contain it instead of raising KeyError.
data.drop(columns='Unnamed: 0', errors='ignore', inplace=True)
# Drop every row that has a missing value in any column.
data.dropna(inplace=True)

# Encode the class labels as integers. np.unique returns the labels sorted,
# so the label -> index mapping is deterministic across runs.
class_mapping = {label: idx for idx, label in enumerate(np.unique(data['target']))}
data['target'] = data['target'].map(class_mapping)

# Subset the data frame for faster computation. The requested size is capped
# at the number of rows left after cleaning, because DataFrame.sample raises
# ValueError when n exceeds the population (sampling without replacement).
sample_size = min(2000, len(data))
data_sample = data.sample(n=sample_size, random_state=22)

# Tokenizing, removing stop words, lemmatizing and stemming.
# These helpers are shared by every clean_text call and are built once here
# instead of once per document.
stop_words = set(stopwords.words('english'))
_NON_ALPHA = re.compile(r'[^a-zA-Z ]')
_LEMMATIZER = WordNetLemmatizer()
_STEMMER = PorterStemmer()


def clean_text(text):
    """Normalise a raw document into a string of stemmed tokens.

    Steps, in order: tokenize; drop tokens whose lowercase form is an
    English stop word; strip every character that is not an ASCII letter
    or a space and lowercase the rest; lemmatize; Porter-stem.

    Args:
        text: The raw document as a string.

    Returns:
        The processed tokens joined by single spaces. Tokens that contain
        no letters (punctuation, numbers) are dropped.
    """
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stop words, then strip non-alphabetical characters.
    stripped_tokens = (
        _NON_ALPHA.sub('', token).lower()
        for token in tokens
        if token.lower() not in stop_words
    )
    # Pure punctuation/digit tokens collapse to '' after stripping; drop them
    # here rather than carrying empty strings through the lemmatizer.
    cleaned_tokens = [token for token in stripped_tokens if token]
    # Lemmatize the tokens
    lemmatized_tokens = [_LEMMATIZER.lemmatize(token) for token in cleaned_tokens]
    # Join and re-split so a token that still holds an internal space is
    # stemmed word by word.
    processed_text = ' '.join(lemmatized_tokens)
    # Stem the tokens
    return ' '.join(_STEMMER.stem(token) for token in processed_text.split())

# Run every sampled document through clean_text and store the result in a new column
data_sample['cleaned_text'] = data_sample['text'].map(clean_text)

In [None]:
# Features are the cleaned text; labels are the integer-encoded targets
X = data_sample['cleaned_text'].values
y = data_sample['target'].values

# Hold out 30% of the sample for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=22,
)

# TF-IDF vectorizer with accent stripping, lowercasing and the custom
# preprocessor hook all switched off
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

# Model Testing

In [None]:
# importing the classification algorithms

from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process import GaussianProcessClassifier


In [None]:
# Instantiate every candidate classifier imported above with fixed hyperparameters.
# NOTE(review): `multi_class`, AdaBoost's `algorithm` and XGBoost's
# `use_label_encoder` look deprecated in newer library releases — confirm
# against the installed scikit-learn / xgboost versions.

# Linear models
ppn = Perceptron(eta0=0.1, random_state=1)
lr_solver1 = LogisticRegression(C=100.0, solver='lbfgs', multi_class='ovr')
lr_solver2 = LogisticRegression(C=100.0, solver='liblinear', multi_class='ovr')

# Probabilistic, neighbour-based and discriminant models
nb = GaussianNB()
knn = KNeighborsClassifier(n_neighbors=5, p=2)
QDA = QuadraticDiscriminantAnalysis()

# Support vector machines
svm_linear = SVC(kernel='linear', C=1.0, random_state=1)
svm_rbf = SVC(kernel='rbf', random_state=1, gamma=0.10, C=10.0)
linear_svc = LinearSVC(dual="auto", random_state=0, tol=1e-5)

# Decision trees, one per split criterion
tree_gini = DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=1)
tree_entropy = DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=1)

# Ensembles
abc = AdaBoostClassifier(
    algorithm='SAMME',
    n_estimators=100,
    learning_rate=0.1,
    random_state=1,
)
RF = RandomForestClassifier(n_estimators=20, random_state=1, n_jobs=2)
bag = BaggingClassifier(
    n_estimators=100,
    max_samples=1.0,
    max_features=1.0,
    bootstrap=True,
    bootstrap_features=False,
    n_jobs=1,
    random_state=1,
)
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.01,
    max_depth=4,
    random_state=1,
    use_label_encoder=False,
)

# Neural network and Gaussian process
mlp = MLPClassifier(alpha=1, max_iter=100, random_state=42)
GPC = GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42)

In [None]:
# Display name -> estimator for every algorithm that is evaluated.
# The insertion order is the order in which results are printed.
clf_dict = {
    'perceptron': ppn,
    'Log Reg lbfgs': lr_solver1,
    'Log Reg liblinear': lr_solver2,
    'KNN': knn,
    'Linear kernel svm': svm_linear,
    'RBF kernel svm': svm_rbf,
    'Linear SVC': linear_svc,
    'Decision Tree gini': tree_gini,
    'Decision Tree entropy': tree_entropy,
    'AdaBoost': abc,
    'RandomForest': RF,
    'Bagging Clf': bag,
    'xgb': xgb_model,
}
# Left out of the comparison (entries kept here for reference):
#   'GaussianNB': nb
#   'Quadratic Discriminant Analysis': QDA
#   'MLP': mlp
#   'Gaussian PC': GPC

# Fit a TF-IDF + classifier pipeline per algorithm and report its test accuracy
for clf_name, clf in clf_dict.items():
    clf_tfidf = Pipeline([('vect', tfidf), (clf_name, clf)])
    clf_tfidf.fit(X_train, y_train)
    print(f'Test Accuracy for {clf_name}: {clf_tfidf.score(X_test, y_test):.3f}')
    print('----------------------------------------------')

 From the preliminary analysis, of the algorithms tested, Decision Tree, AdaBoost, Bagging, and XGBoost achieved the highest accuracy score of 0.992 on the sampled dataset. KNN performed the worst, with a score of 0.763.