### Importing packages

In [None]:
import pandas as pd
from pandas import DataFrame as df
import numpy as np
import seaborn as sns
import re

import nltk
import csv
import nltk.tokenize 
from nltk.tokenize import word_tokenize
nltk.download('punkt', quiet=True)
from nltk.corpus import stopwords
from sklearn.manifold import TSNE

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from keras.utils.np_utils import to_categorical

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


from sklearn import metrics
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

### Loading dataset

In [None]:
data_csv = pd.read_csv(r"C:\UCD\Modules\Summer\Week 02\ML_Project\21206774.csv",encoding='utf-8', header=0, index_col = False)
data_csv

### Pre-analysis exploring and cleaning process

In [None]:
# The first 10 records
data_csv.head(10)

In [None]:
# The last 10 records
data_csv.tail(10)

#### Data exploration and initial organising

In [None]:
# Display all the columns where the values are null
data_csv.isnull().sum()

#### fixing the cells with null values

In [None]:
# Replacing the null cells with some values
data_csv["short_description"] = data_csv["short_description"].fillna("no_description")

In [None]:
# Drop the rows that have no headline.
# Assigning `Series.dropna()` back to the column does nothing: pandas re-aligns
# the shorter Series on the index and the missing rows come back as NaN, so
# the rows are dropped from the frame itself.
data_csv = data_csv.dropna(subset=["headline"])

In [None]:
# Drop the records that have no category.
# (Re-assigning `Series.dropna()` to the column re-aligns on the index and
# leaves the NaN rows in place, so the rows are dropped from the frame.)
data_csv = data_csv.dropna(subset=["category"])

In [None]:
data_csv["date"] = data_csv["date"].fillna("no_date")

In [None]:
data_csv["authors"] = data_csv["authors"].fillna("no_authors")

In [None]:
data_csv["link"] = data_csv["link"].fillna("no_link")

In [None]:
# Make sure the dataset is fixed and no null values remain.

data_csv.isnull().sum()

In [None]:
# Convert every string cell of the frame to lowercase; non-string cells
# (numbers, NaN) are passed through unchanged.
# isinstance() is the idiomatic type check and also covers str subclasses.
data_csv = data_csv.applymap(lambda s: s.lower() if isinstance(s, str) else s)

## https://stackoverflow.com/questions/39512002/convert-whole-dataframe-from-lower-case-to-upper-case-with-pandas

In [None]:
# Save to a new csv output file.
# index=False keeps the row index out of the file, so re-loading it does not
# create an extra "Unnamed: 0" column.
data_csv.to_csv('C:/UCD/Modules/Summer/Week 02/ML_Project/outputfile.csv', encoding='utf-8', index=False)

In [None]:
# load the new dataset
newdata_csv = pd.read_csv(r"C:\UCD\Modules\Summer\Week 02\ML_Project\outputfile.csv",encoding='utf-8', header=0, index_col = False)
newdata_csv

In [None]:
# Remove, in place, every column whose name contains "Unnamed"
newdata_csv.drop(columns=newdata_csv.filter(regex="Unnamed").columns, inplace=True)

#### I manually removed the extra unnamed columns and gave the name "index" to the first column

In [None]:
# Check the the remaining null values through the whole dataset
newdata_csv.isnull().any()

In [None]:
# Drop the rows that still have no headline.
# (Assigning `Series.dropna()` back to the column re-aligns on the index and
# leaves the NaN rows in place, so the rows are dropped from the frame.)
newdata_csv = newdata_csv.dropna(subset=["headline"])

In [None]:
# dropna() returns a new frame, so the result has to be kept; the bare name on
# the last line still displays the cleaned frame in the notebook.
newdata_csv = newdata_csv.dropna(subset=['headline'])
newdata_csv

In [None]:
newdata_csv['category'].value_counts()

In [None]:
# Visualize the category data.
# The column is passed through the explicit `x=` keyword: seaborn 0.12+ treats
# the first positional argument as `data`, and 0.11 already warns about it.
sns.countplot(x=newdata_csv["category"])
## https://seaborn.pydata.org/generated/seaborn.countplot.html

In [None]:
# Add two helper columns holding the character length of each headline
# ("hd_length") and of each short description ("short_des"), then print both.
newdata_csv["hd_length"] = newdata_csv["headline"].str.len()
newdata_csv["short_des"] = newdata_csv["short_description"].str.len()
print(newdata_csv["hd_length"], newdata_csv["short_des"], sep="\n")


In [None]:
newdata_csv

In [None]:
#visualize the headline distribution
sns.distplot(newdata_csv['hd_length']).set_title('Headlines Distribution')

## https://seaborn.pydata.org/generated/seaborn.distplot.html

In [None]:
# Visualize the distribution of the short-description lengths ("short_des")
sns.distplot(newdata_csv['short_des']).set_title('Short Description Distribution')

#### From the visualizations generated above, I suspect that some headlines and short descriptions are mixed up in the dataset, which is related to the missing data in the "short_description" column.

### NLP process

In [None]:
## https://stackoverflow.com/questions/39782418/remove-punctuations-in-pandas

# Recognise and remove the punctuation.
# - str.replace returns a new Series, so the result is assigned back;
# - the pattern is a raw string so "\w" and "\s" are not read as string escapes;
# - regex=True is explicit because pandas >= 2.0 matches literally by default.
newdata_csv['headline'] = newdata_csv['headline'].str.replace(r'[^\w\s]', '', regex=True)
newdata_csv['headline']


In [None]:
# Tokenize the headline column.
# The stop-word list is a separate NLTK corpus; only 'punkt' is downloaded
# next to the imports, so fetch 'stopwords' before stopwords.words() is called.
nltk.download('stopwords', quiet=True)

headlines = newdata_csv['headline']
headlines = [nltk.word_tokenize(headline) for headline in headlines]
stop_words = set(stopwords.words('english'))


headlines

In [None]:
# Remove the stop words from the headline column and join the remaining words
# back into a single string per row.
## https://www.datasnips.com/58/remove-stop-words-from-text-in-dataframe-column/

from nltk.corpus import stopwords

# A set gives constant-time membership tests; the list returned by
# stopwords.words() would be scanned once per word of every headline.
stop_words = set(stopwords.words('english'))
newdata_csv['headline'] = newdata_csv['headline'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words)
)

In [None]:
# Copy the cleaned headline text (still one string per row, not a token list)
# into a new column "tokenized_headlines"
newdata_csv ['tokenized_headlines'] = newdata_csv ['headline']

In [None]:
newdata_csv.head()

#### Notes: I replaced the null values in all the columns with "no_headlines, No_short_desc,....".
#### After exploring and analyzing the data, I decided to work on the "headline" column since there are only 14 headlines missing, while the "short_description" column has 662 missing rows (around 9% of the dataset), so I decided not to depend on that column in this analysis. 
#### I then cleaned the data, removed the punctuation and the unnecessary words (stop words), and assigned the cleaned data to a new column "tokenized_headlines".


# Data preparation & modeling

### Encoding

In [None]:
# Label-encode the category column: every distinct category value is mapped
# to an integer id stored in "category_encoded".
from sklearn import preprocessing

encoder = preprocessing.LabelEncoder()
encoder.fit(newdata_csv['category'])
newdata_csv['category_encoded'] = encoder.transform(newdata_csv['category'])
newdata_csv

In [None]:
# Save the preprocessed file so the whole process does not have to be re-run
# after Jupyter is closed; work can restart from the loading cell below.
# index=False keeps the row index out of the file, so no "Unnamed: 0" column
# appears when it is read back.
newdata_csv.to_csv('C:/UCD/Modules/Summer/Week 02/ML_Project/preprocessed_data.csv', encoding='utf-8', index=False)

In [None]:
# Starting point: read the preprocessed csv back in and discard any column
# whose name contains "Unnamed".
preprocessed_csv = pd.read_csv(
    r"C:\UCD\Modules\Summer\Week 02\ML_Project\preprocessed_data.csv",
    encoding='utf-8',
    header=0,
)
preprocessed_csv.drop(columns=preprocessed_csv.filter(regex="Unnamed").columns, inplace=True)
preprocessed_csv

In [None]:
preprocessed_csv

In [None]:
preprocessed_csv['category_encoded'].value_counts()
# 0 is for the titles with no category, 1 is for the travel news category, 2 is for the weird news

### Splitting and training the data

In [None]:
from sklearn.model_selection import train_test_split

# The data. A headline that became empty during cleaning is written to the csv
# as an empty field and read back as NaN, which the text vectorizer rejects,
# so those cells are turned back into empty strings.
X = preprocessed_csv['tokenized_headlines'].fillna('')
y = preprocessed_csv['category_encoded'] #The target

# First split: 70% train+valid / 30% test.
X_train_plus_valid, X_test, y_train_plus_valid, y_test = train_test_split(X, y, random_state=0, test_size = 0.3, train_size = 0.7)
# Second split inside the 70%: train / valid.
# NOTE(review): 0.199/0.7 + 0.5/0.7 is slightly below 1, so a few rows of the
# 70% end up in neither train nor valid — confirm this is intended.
X_train, X_valid, y_train, y_valid = train_test_split(X_train_plus_valid, y_train_plus_valid, random_state=0, test_size = 0.199/0.7, train_size = 0.5/0.7)


In [None]:
# Keep each split as a DataFrame holding all columns of the selected rows,
# instead of an (X, y) tuple: the splits are written out with .to_csv()
# further down, and a tuple has no such method.
train = preprocessed_csv.loc[X_train.index]
valid = preprocessed_csv.loc[X_valid.index]
test = preprocessed_csv.loc[X_test.index]

In [None]:
train

In [None]:
valid

In [None]:
# Report the shape of every split. The two validation lines now print the
# validation shapes (they previously repeated y_train / y_test).
print("X_train shape: {}".format(X_train.shape))
print("X_test shape: {}".format(X_test.shape))
print("y_train shape: {}".format(y_train.shape))
print("y_test shape: {}".format(y_test.shape))
print("X_valid shape: {}".format(X_valid.shape))
print("y valid shape: {}".format(y_valid.shape))

In [None]:
# NOTE(review): this rebinds X to a positional slice of preprocessed_csv
# (columns at positions 1..64); the train/valid/test splits were made from
# the previous X — confirm this cell is still needed.
X = preprocessed_csv.iloc[:,1:65]
X

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split

# Write the train / valid / test splits to csv files, without the row index
for split_frame, csv_name in ((train, 'train.csv'), (valid, 'valid.csv'), (test, 'test.csv')):
    split_frame.to_csv(csv_name, index=False)

#### Loading (training and validation csv files)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression #A variant regression for classification tasks!
from sklearn.naive_bayes import GaussianNB as NaiveBayes
from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.utils.np_utils import to_categorical
from sklearn.metrics import accuracy_score

In [None]:
train_csv = pd.read_csv('train.csv')

In [None]:
valid_csv = pd.read_csv('valid.csv')

In [None]:
test_csv = pd.read_csv('test.csv')

In [None]:
# As we are working with text data, the text has to be converted into numerical form.
# A TF-IDF vectorizer is used for that task;
# the parameters it is built with are defined first.


# Parameters for the tfidf (each is passed to TfidfVectorizer under the same name)
ngram_range = (1, 2)
min_df = 10
max_df = 1.
max_features = 300

In [None]:
# TF-IDF vectorizer, configured with the parameters defined above
vectorizer = TfidfVectorizer(
    encoding='utf-8',
    lowercase=False,
    ngram_range=ngram_range,
    min_df=min_df,
    max_df=max_df,
    max_features=max_features,
    norm='l2',  # for normalization
    sublinear_tf=True,
    stop_words=None,
)

# The vocabulary is learnt on the training split only and then reused to
# transform the validation and test splits (features for the ML algorithms).
feature_store_train = vectorizer.fit_transform(X_train).toarray()
feature_store_valid = vectorizer.transform(X_valid).toarray()
feature_store_test = vectorizer.transform(X_test).toarray()

# Matching targets for each feature matrix
labels_train, labels_valid, labels_test = y_train, y_valid, y_test

In [None]:
labels_train.shape # shape of the training targets (y_train)

In [None]:
labels_valid.shape # shape of the validation targets (y_valid)

In [None]:
labels_test.shape # shape of the test targets (y_test)

In [None]:
feature_store_train.shape # shape of the training feature matrix (vectorized X_train)

In [None]:
feature_store_valid.shape # shape of the validation feature matrix (vectorized X_valid)

### Building classification models

**I will be using two classifiers, LogisticRegression and KNeighbors, as they can easily deal with text classification and match each text to the category it belongs to.**

In [None]:
# LogisticRegression baseline: fit on the training features, then report
# accuracy and the per-class report on the test features.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lg = LogisticRegression().fit(feature_store_train, labels_train)

model_predictions = lg.predict(feature_store_test)
print('Accuracy', accuracy_score(labels_test, model_predictions))
print(metrics.classification_report(labels_test, model_predictions))

In [None]:
# KNeighbors baseline: fit on the training features, then report accuracy
# and the per-class report on the test features.
from sklearn.neighbors import KNeighborsClassifier

KNeighbors = KNeighborsClassifier().fit(feature_store_train, labels_train)

model_predictions = KNeighbors.predict(feature_store_test)
print('Accuracy', accuracy_score(labels_test, model_predictions))
print(metrics.classification_report(labels_test, model_predictions))

**Findings:**
- LogisticRegression classifier (lg):
  The scores are generally good, between 0.86 and 0.98; however, category 1 achieved only a moderate recall (0.50) and, as a result, a moderate f1 score (0.64).
- KNeighbors classifier (KNeighbors):
  The accuracy (0.81) was almost the same as for lg, and the other scores are generally good; however, the scores for category 1 are only average: 0.65 for precision, 0.54 for recall and 0.59 for f1.

In [None]:
# Confusion matrix for the LogisticRegression model, computed on the
# training features, as a second check on the results.
X_cm, y_true_labels, model = feature_store_train, labels_train, lg

y_pred = model.predict(X_cm)
print(metrics.classification_report(y_true_labels, y_pred))

cm = metrics.confusion_matrix(y_true_labels, y_pred)
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix');

**The scores from the confusion-matrix evaluation of lg are almost the same as the ones reported with the accuracy metric for the LogisticRegression classifier, for both categories 0 and 1, which supports the scores obtained for the lg classifier. (Note that this matrix is computed on the training features, while the accuracy above was computed on the test features.)**

**The plot summarizes the values as follows: TP is 2934, TN is 538, FP is 51 and FN is 459. FP is low, which is a good indication, while FN is a bit high.**

In [None]:
# Confusion matrix for the KNeighbors model, computed on the training features
X_cm, y_true_labels, model = feature_store_train, labels_train, KNeighbors

y_pred = model.predict(X_cm)
print(metrics.classification_report(y_true_labels, y_pred))

cm = metrics.confusion_matrix(y_true_labels, y_pred)
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix');

**We note a small improvement in the scores in general compared to the accuracy metric that I used for the KNeighbors classifier.**
The plot summarizes the values as follows: TP is 2808, TN is 619, FP is 177 and FN is 378. FP is fairly low, which is a good indication, while FN is a bit high.

### Evaluation of the performance of the models on the validation

In [None]:
# Evaluate the LogisticRegression classifier on the validation set.
# The model is fitted on the TRAINING features and scored on the VALIDATION
# features. Fitting it on the validation split and scoring the test split (as
# this cell did before) neither keeps the validation data held out nor
# produces a validation score.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
lg = LogisticRegression()
lg.fit(feature_store_train, labels_train)

# Predict and evaluate the model using the accuracy metric
model_predictions = lg.predict(feature_store_valid)
print('Accuracy', accuracy_score(labels_valid, model_predictions))
print(metrics.classification_report(labels_valid, model_predictions))

In [None]:
# Confusion matrix for the LogisticRegression model on the validation features
X_cm, y_true_labels, model = feature_store_valid, labels_valid, lg

y_pred = model.predict(X_cm)
print(metrics.classification_report(y_true_labels, y_pred))

cm = metrics.confusion_matrix(y_true_labels, y_pred)
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix');

**Applying the LogisticRegression classifier on the validation set shows generally good results; however, the recall for category 1 is very low (0.39) while the precision is high (0.89). With the confusion-matrix evaluation the results are better (the precision is 0.95 and the recall is 0.51), while the accuracy scores from both evaluations are quite similar.**

**The plot summarizes the values as follows: TP is 1166, TN is 208, FP is 12 and FN is 200. FP is low, which is a good indication, while FN is a bit high.**

In [None]:
# Evaluate the KNeighbors classifier on the validation set.
# The model is fitted on the TRAINING features and scored on the VALIDATION
# features; fitting on the validation split and scoring the test split would
# not be a validation measurement.
from sklearn.neighbors import KNeighborsClassifier
KNeighbors = KNeighborsClassifier()
KNeighbors.fit(feature_store_train, labels_train)

# Predict and evaluate the model using the accuracy metric
model_predictions = KNeighbors.predict(feature_store_valid)
print('Accuracy', accuracy_score(labels_valid, model_predictions))
print(metrics.classification_report(labels_valid, model_predictions))

In [None]:
# Confusion matrix for the KNeighbors model on the validation features
X_cm, y_true_labels, model = feature_store_valid, labels_valid, KNeighbors

y_pred = model.predict(X_cm)
print(metrics.classification_report(y_true_labels, y_pred))

cm = metrics.confusion_matrix(y_true_labels, y_pred)
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix');

**The KNeighbors validation results from the accuracy metric and from the confusion matrix are almost the same; the results become slightly better in the confusion-matrix evaluation.**

**The plot summarizes the values as follows: TP is 1124, TN is 234, FP is 54 and FN is 174. FP is low, which is a good indication.**

### Error analysis

In [None]:
# LogisticRegression misclassification rate.
# Error rate = (FP + FN) / (TP + TN + FP + FN), using the counts read off the
# confusion matrix produced for this model.
## https://www.ritchieng.com/machine-learning-evaluate-classification-model/

TP, TN = 2934, 538
FP, FN = 51, 459
classification_error = float(FP + FN) / (TP + TN + FP + FN)

print(classification_error)

In [None]:
# KNeighbors misclassification rate: (FP + FN) over all predictions
TP, TN = 2808, 619
FP, FN = 177, 378
classification_error = float(FP + FN) / (TP + TN + FP + FN)

print(classification_error)

In [None]:
# LogisticRegression misclassification rate on the validation set:
# (FP + FN) over all predictions

TP, TN = 1166, 208
FP, FN = 12, 200
classification_error = float(FP + FN) / (TP + TN + FP + FN)

print(classification_error)

In [None]:
# KNeighbors misclassification rate on the validation set.
# FN is 174, the value read off the KNeighbors validation confusion matrix.
# With 174 the four counts add up to 1586, the same validation-set size as
# the LogisticRegression counts (1166 + 208 + 12 + 200); 179 was a typo.

FP = 54
FN = 174
TP = 1124
TN = 234
classification_error = (FP + FN) / float(TP + TN + FP + FN)

print(classification_error)

### Applying changes to test the classifiers

##### LogisticsRegression classifier

#### Hyperparameter tuning

In [None]:
# https://www.youtube.com/watch?v=nFna2s244vA&ab_channel=SolveBusinessProblemsUsingAnalytics

from sklearn.model_selection import GridSearchCV

# Regularisation strength (C) and penalty type are searched to reduce overfitting.
# The default 'lbfgs' solver supports only the l2 penalty, so the l1 candidates
# get their own sub-grid with the 'liblinear' solver; with a single grid every
# l1 fit fails and is scored as NaN.
params_grid = [
    {'C': [0.1, 0.001, 1], 'penalty': ['l2']},
    {'C': [0.1, 0.001, 1], 'penalty': ['l1'], 'solver': ['liblinear']},
]
model = LogisticRegression()
clf = GridSearchCV(model, params_grid, cv = 3, verbose = 1)
bestF = clf.fit (feature_store_train, labels_train)

In [None]:
# extract the best parameters out of the hyperparameter

bestF.best_params_

In [None]:
# Re-train LogisticRegression with the parameters selected by the grid search
# (C=1, l2 penalty) and score it on the test features.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lg = LogisticRegression(C=1, penalty='l2').fit(feature_store_train, labels_train)

model_predictions = lg.predict(feature_store_test)
print('Accuracy', accuracy_score(labels_test, model_predictions))
print(metrics.classification_report(labels_test, model_predictions))

Comparing the results of the tuned classifier with the earlier ones, we can see that the scores did not change (no increase or decrease occurred), which suggests that these are the best values we can achieve with LogisticRegression for the parameters searched.

In [None]:
# Confusion matrix for the tuned LogisticRegression model on the training
# features, to check whether anything has changed.
X_cm, y_true_labels, model = feature_store_train, labels_train, lg

y_pred = model.predict(X_cm)
print(metrics.classification_report(y_true_labels, y_pred))

cm = metrics.confusion_matrix(y_true_labels, y_pred)
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix');

No values have changed in the confusion matrix either.

In [None]:
# Error rate of the tuned LogisticRegression: (FP + FN) over all predictions
TP, TN = 2934, 538
FP, FN = 51, 459
classification_error = float(FP + FN) / (TP + TN + FP + FN)

print(classification_error)

As before, the error rate is unchanged.

##### Kneighbors classification model

#### Hyperparameter tuning

In [None]:
# Repeating the same steps but this time for Kneighbors parameters

params_KNN = {'n_neighbors': [1,2,3,4,5,6,7], 'p':[1,2,5]} # candidate neighbour counts and values of the distance parameter p
KN_model = KNeighborsClassifier()
clf = GridSearchCV(KN_model, params_KNN, cv=3, verbose = 1, n_jobs = -1)
bestF = clf.fit (feature_store_train, labels_train)

In [None]:
# extract the best parameters out of the hyperparameter

bestF.best_params_

In [None]:
# Re-train KNeighbors with the selected parameters (6 neighbours, p=2) and
# score it on the test features.
KN = KNeighborsClassifier(n_neighbors=6, p=2).fit(feature_store_train, labels_train)

model_predictions = KN.predict(feature_store_test)
print('Accuracy', accuracy_score(labels_test, model_predictions))
print(metrics.classification_report(labels_test, model_predictions))

A slight improvement was achieved in the accuracy score (it was 0.81 and became 0.83); the recall for category 0 became 0.95 (was 0.91) and its f1 score is now 0.90 (was 0.88). 
The precision for category 1 became 0.76 (was 0.65), while the recall became 0.46 (was 0.54) and the f1 score is now 0.57 (was 0.59), i.e. a slight decrease in these two values. 

In [None]:
# Confusion matrix for the tuned KNeighbors model on the training features
X_cm, y_true_labels, model = feature_store_train, labels_train, KN

y_pred = model.predict(X_cm)
print(metrics.classification_report(y_true_labels, y_pred))

cm = metrics.confusion_matrix(y_true_labels, y_pred)
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix');

FP is 101 (was 177), TP is 2884 (was 2808), FN is 480 (was 378), TN is 517 (was 619).
FP dropped, but FN rose, so the total number of misclassified samples increased slightly (from 555 to 581).

In [None]:
# Misclassification rate of the tuned KNeighbors: (FP + FN) over all predictions
TP, TN = 2884, 517
FP, FN = 101, 480
classification_error = float(FP + FN) / (TP + TN + FP + FN)

print(classification_error)

The rate was 0.139 (555/3982) and became 0.146 (581/3982).

In [None]:
# Saving the logisticRegression model using pickle
## https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/
## https://www.youtube.com/watch?v=KfnhNlD8WZI&ab_channel=codebasics

import pickle

# Serialise the fitted `lg` estimator to the file "LogisticRegression_model"
# in the current working directory (binary mode).
with open ('LogisticRegression_model', 'wb') as lgmodel:
    pickle.dump (lg, lgmodel)

In [None]:
# Saving the KNeighbors model using pickle
import pickle

# Serialise the fitted `KN` estimator to the file "KNeighbors_model"
# in the current working directory (binary mode).
with open ('KNeighbors_model', 'wb') as KNeighborsmodel:
    pickle.dump (KN, KNeighborsmodel)

In [None]:
# loading the logisticRegression model (the best model)
# NOTE: pickle.load can execute code stored in the file; only load model
# files that were created by this notebook.

with open ('LogisticRegression_model', 'rb') as lgmodel:
    mp_lg_pkl = pickle.load (lgmodel)

In [None]:
# loading the KNeighbors model
# NOTE: pickle.load can execute code stored in the file; only load model
# files that were created by this notebook.

with open ('KNeighbors_model', 'rb') as KNeighborsmodel:
    mp_kn_pkl = pickle.load (KNeighborsmodel)

In [None]:
# merging train and valid data sets
## https://www.codegrepper.com/code-examples/python/How+to+Merge+train+and+Test+dataset+in+python

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, keeps the original row labels of both frames.
merged_dset = pd.concat([train_csv, valid_csv])
merged_dset.shape

In [None]:
merged_dset.head(10)

In [None]:
merged_dset.tail(10)

### Performing cross validation on the merged data sets (merged_dset)

In [None]:
# Splitting the dataset
X = merged_dset.loc[:,"tokenized_headlines"] # The data
y = merged_dset.loc[:,"category_encoded"] # the target

In [None]:
X_train_plus_valid, X_test, y_train_plus_valid, y_test = train_test_split(X, y, random_state=0, test_size = 0.30)

In [None]:
# Vectorize and store all the data features.
# The test documents go through transform(), not fit_transform(): fitting the
# vectorizer again on the test split rebuilds its vocabulary, so the test
# columns would no longer correspond to the training columns.
# NOTE(review): X_train / y_train are still the split made before the merge;
# the X_train_plus_valid / y_train_plus_valid pair from the previous cell is
# not used — confirm which one was intended.

feature_train = vectorizer.fit_transform (X_train).toarray()
lab_train = y_train
feature_test = vectorizer.transform (X_test).toarray()
lab_test = y_test

In [None]:
X

In [None]:
y

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kf = KFold(n_splits=5)
kf

In [None]:
# Running the cross validation on logisticsRegression saved model

In [None]:
mp_lg_pkl.fit(feature_train, lab_train)

In [None]:
## https://www.youtube.com/watch?v=gJo0uNL-5Qw&t=871s&ab_channel=codebasics
cross_val_score(mp_lg_pkl, feature_train, lab_train)

In [None]:
# Running the cross validation on KNeighbors

In [None]:
mp_kn_pkl.fit(feature_train, lab_train)

In [None]:
cross_val_score(mp_kn_pkl , feature_train, lab_train)

In [None]:
# We can see that the accuracy results of the KNeighbors model are lower than those of the logistic regression,
# which also confirms that logistic regression is the best model for this dataset: it achieved 0.87 before cross validation
# and 0.86 after cross validation, while the KNeighbors result was 0.85 and became 0.82 after cross validation.

### Applying the best model to the test.csv data set

In [None]:
# Loading the test dataset
test_csv = pd.read_csv('test.csv')

In [None]:
test_csv

In [None]:
test_csv['category_encoded'].value_counts()

In [None]:
test_csv.isnull().sum()

In [None]:
# Getting rid of the placeholder headlines.
# Boolean indexing returns a new frame, so the result is assigned back.
# na=True treats a missing headline as a match, so those rows are removed as
# well — the same rows the previous `... == False` comparison excluded.
# regex=False because "no_headline" is a literal substring.

test_csv = test_csv[~test_csv["headline"].str.contains("no_headline", regex=False, na=True)]
test_csv

In [None]:
# Keep only the rows whose "category_encoded" value is not 0.
# (Around 10 zero-coded rows sometimes show up for the empty rows that were
# filled with placeholder data to avoid leaving them empty.)
test_csv = test_csv.loc[test_csv["category_encoded"] != 0]

In [None]:
# I already did the data cleaning on the file so I don't need to do it again


In [None]:
feature_store_test.shape

In [None]:
labels_test.shape

In [None]:
# Make sure the data set is not containing any null values
test_csv.isnull().sum()

In [None]:
feature_store_test

In [None]:
labels_test

In [None]:
# Applying the saved LogisticRegression model (from cross validation) to the
# held-out test features.
# The model is NOT re-fitted here: fitting on the test set and then predicting
# the same rows only measures how well the model reproduces data it has
# already seen, not how it generalises.

model_predictions = mp_lg_pkl.predict(feature_store_test)
print('Accuracy', accuracy_score(labels_test, model_predictions))
print(metrics.classification_report(labels_test,model_predictions))

In [None]:
# Confusion matrix for the saved LogisticRegression model on the test features
X_cm, y_true_labels, model = feature_store_test, labels_test, mp_lg_pkl

y_pred = model.predict(X_cm)
print(metrics.classification_report(y_true_labels, y_pred))

cm = metrics.confusion_matrix(y_true_labels, y_pred)
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix');

In [None]:
# The prediction model's results (accuracy, recall and f1) are quite similar to the validation ones; the only very slight differences are in the precision and recall (a difference of only 0.01).
# The confusion-matrix accuracy was 0.87 for validation while the test accuracy is 0.84; the recall and f1 results for category 1 differ a bit: 0.51 and 0.66 respectively for the validation set, against 0.39 and 0.54 for the test set,
# which means the algorithm is not returning enough of the relevant results.
# However, the precision is almost the same (0.95 and 0.90) and still high, which is a good indication:
# it is returning more relevant results than irrelevant ones.

### Applying the best classifier to the test set

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Best classifier (C=1, l2 penalty): fitted on the TRAINING features and only
# scored on the test features. Fitting on the test set itself would make the
# reported test scores a measure of memorisation rather than generalisation.
lg = LogisticRegression(C=1, penalty = 'l2')
lg.fit(feature_store_train, labels_train)

# Predict and evaluate the model using the accuracy metric
model_predictions = lg.predict(feature_store_test)
print('Accuracy', accuracy_score(labels_test, model_predictions))
print(metrics.classification_report(labels_test,model_predictions))

In [None]:
# Confusion matrix for the best classifier (lg) on the test features
X_cm, y_true_labels, model = feature_store_test, labels_test, lg

y_pred = model.predict(X_cm)
print(metrics.classification_report(y_true_labels, y_pred))

cm = metrics.confusion_matrix(y_true_labels, y_pred)
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix');

In [None]:
# After applying the best model to the test set, it produced exactly the same results as the validation run,
# so no difference has occurred. 