In [1]:
import pandas as pd
import numpy as np

In [2]:
import pandas as pd

file_path = r"C:\Users\Chinelo\Downloads\twitterdata.csv"

try:
    twitter_data = pd.read_csv(file_path, encoding='latin1')  
    print("Data imported successfully!")
    print(twitter_data.head())
except Exception as e:
    print(f"An error occurred: {e}")


Data imported successfully!
   ItemID  Sentiment                                      SentimentText
0       1          0                       is so sad for my APL frie...
1       2          0                     I missed the New Moon trail...
2       3          1                            omg its already 7:30 :O
3       4          0            .. Omgaga. Im sooo  im gunna CRy. I'...
4       5          0           i think mi bf is cheating on me!!!   ...


In [3]:
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score


In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Chinelo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# printing the stopwords in english
# stopwords are anything that doesnt have any influencial meaning to my text data
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

## Data Processing

In [6]:
# check umber of columns and rows
twitter_data.shape

(99989, 3)

In [7]:
twitter_data.head(5)

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...


In [8]:
# check for the nnumber of missing values if any
twitter_data.isnull().sum()

ItemID           0
Sentiment        0
SentimentText    0
dtype: int64

There are no missing values.

In [9]:
# check the distribution of target values which is the sentiment column
twitter_data['Sentiment'].value_counts()

Sentiment
1    56457
0    43532
Name: count, dtype: int64

Even though the distribution is slightly imbalanced, i will use it like that.

## Lemmatization
Lemmatization is the process of reducing a word to its root word. This will help reduce the dimension and complexity of the data.

In [10]:
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Chinelo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

In [12]:
def lemmatization(text):
    # Clean the text: remove non-alphabetic characters and convert to lowercase
    lemmatized_text = re.sub('[^a-zA-Z]', ' ', text)
    lemmatized_text = lemmatized_text.lower()
    
    # Split the text into individual words
    lemmatized_text = lemmatized_text.split()
    
    # Lemmatize each word (ignore stopwords)
    lemmatized_text = [lemmatizer.lemmatize(word) for word in lemmatized_text if word not in stopwords.words('english')]
    
    # Join the lemmatized words back into a string
    lemmatized_text = ' '.join(lemmatized_text)
    
    return lemmatized_text

stemmed_text = re.sub('[^a-zA-Z]', ' ', text):
This removes all the characters in the tweet text column that is not small letters a-z and capitals letters A-Z. somit contains oly words ad no number or special characters.

stemmed_text = stemmed_text.lower():
This converts all the text column letters to lower case

stemmed_text = stemmed_text.split():
Here, we split all the words and put  them into a list

stemmed_text = [port_stem.stem(word) for word in stemmed_text if not word in stopwords.words('english')]
 Here, using port-stem, we reduce words to its root form so as to make processing easier.
 then we take each word in the stemmed text and find out if its a stopword. if it is, it will be removed and stemming wont be done on them.
 Remove the stopwords from the data inorder to remove more complexity from the data since its already large.

 stemmed_text = ' '.join(stemmed_text)
 Here, join all the stemmed words in the stemmed text back.
 

After, we will be left with stemmed tweets with words in its root form




In [14]:
# Apply lemmatization to the 'text' column and create a new 'lemmatized_text' column
twitter_data['lemmatized_text'] = twitter_data['SentimentText'].apply(lemmatization)

In [15]:
twitter_data.head(3)

Unnamed: 0,ItemID,Sentiment,SentimentText,lemmatized_text
0,1,0,is so sad for my APL frie...,sad apl friend
1,2,0,I missed the New Moon trail...,missed new moon trailer
2,3,1,omg its already 7:30 :O,omg already


In [16]:
print(twitter_data['lemmatized_text'])

0                                           sad apl friend
1                                  missed new moon trailer
2                                              omg already
3        omgaga im sooo im gunna cry dentist since supo...
4                                     think mi bf cheating
                               ...                        
99984    cupcake seems like repeating problem hope able...
99985    cupcake arrrr replied different tweet time see...
99986                                   cupcake ya thought
99987                      cupcake dollie yes yes glad fun
99988                               cupcake kayla haha yes
Name: lemmatized_text, Length: 99989, dtype: object


In [18]:
print(twitter_data['Sentiment'])

0        0
1        0
2        1
3        0
4        0
        ..
99984    0
99985    1
99986    0
99987    1
99988    1
Name: Sentiment, Length: 99989, dtype: int64


In [19]:
# seperate the the data and the label.
X = twitter_data['lemmatized_text'].values
Y = twitter_data['Sentiment'].values

In [20]:
X

array(['sad apl friend', 'missed new moon trailer', 'omg already', ...,
       'cupcake ya thought', 'cupcake dollie yes yes glad fun',
       'cupcake kayla haha yes'], shape=(99989,), dtype=object)

In [21]:
Y

array([0, 0, 1, ..., 0, 1, 1], shape=(99989,))

Split data into train and test data


In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, stratify=Y, random_state = 2)

# X_train and Y_train are linked as Y_train the coresponding label of X.
# X_test and Y_test are linked.

In [23]:
print(X.shape, X_train.shape, X_test.shape)

(99989,) (79991,) (19998,)


In [24]:
X_train

array(['craftyb mortgage bead mortgage bead hard decide',
       'ammarvellous never contest hug sweeetdreams hopefully leek night',
       'alexd xo hot jealous wish italy', ...,
       'claimtofame gasp oh happened feeling well back tomoz thou',
       'clairel yea sure haha weget go see coraline print boot omg happened',
       'angelous'], shape=(79991,), dtype=object)

In [25]:
X_test

array(['candiceshane saturday l help supposed behave',
       'beethequeenbee supper sick vicks chest ugh', 'atebits hah late',
       ..., 'angypangy yeah hoping amp reason',
       'fthawaiian yeah occurred tweeted one broken amp metal plate holding together hurt',
       'clairewhill trying listen show online nothing happening'],
      shape=(19998,), dtype=object)

In [26]:
# The model cant work with text data so we have to convert them to numerical data.
# So, feature extraction is done
vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [27]:
print(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 595117 stored elements and shape (79991, 79719)>
  Coords	Values
  (0, 38710)	0.3433051768703893
  (0, 58450)	0.6323583967300984
  (0, 18979)	0.6215634591439957
  (0, 48441)	0.18303216421726134
  (0, 40749)	0.24984496540420023
  (1, 8043)	0.3910478305670075
  (1, 59667)	0.226477323769663
  (1, 37595)	0.3587771428334812
  (1, 49881)	0.2669047388105966
  (1, 71981)	0.4929859214022431
  (1, 49595)	0.2842982432308252
  (1, 54586)	0.47574899721722086
  (1, 59906)	0.2136181641375939
  (2, 4868)	0.566988081941857
  (2, 78661)	0.40848551629409313
  (2, 49684)	0.3370800207702877
  (2, 51952)	0.34473065478991594
  (2, 77988)	0.2703349387678716
  (2, 51453)	0.4540054559593216
  (3, 8402)	0.36869142385338594
  (3, 73394)	0.33770760959148766
  (3, 43507)	0.26193937314391325
  (3, 77715)	0.23010305005797294
  (3, 76434)	0.21107991001050688
  (3, 44717)	0.26868498463528107
  :	:
  (79987, 62096)	0.311530303684509
  (79987, 35317)	0.3499556

In [28]:
print(X_test)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 135910 stored elements and shape (19998, 79719)>
  Coords	Values
  (0, 19814)	0.5377486578186047
  (0, 28837)	0.5809568301238395
  (0, 48936)	0.30478505224485114
  (0, 67517)	0.3704257437265125
  (0, 71762)	0.37842619208970335
  (1, 32962)	0.4504873899575669
  (1, 68914)	0.31013209126665653
  (1, 71745)	0.5070756106206824
  (1, 75330)	0.3347616403911396
  (1, 76413)	0.5759237507622729
  (2, 14886)	0.6585828712332321
  (2, 48158)	0.5868413754586665
  (2, 54325)	0.4710475578622316
  (3, 46035)	0.45051679624980834
  (3, 48718)	0.33043208337560226
  (3, 55855)	0.5432186819212088
  (3, 56079)	0.45128114886761866
  (3, 58212)	0.24391176829313557
  (3, 73954)	0.21240772923681167
  (3, 77381)	0.1740659176817745
  (3, 78908)	0.23280701237624526
  (4, 19578)	0.4130217727202061
  (4, 40559)	0.15356009567488627
  (4, 42436)	0.31069558335951514
  (4, 44001)	0.23562208784164226
  :	:
  (19994, 63133)	0.23546879992983222
  (19994, 65861)	0

##  Training the machine learning model

In [29]:

# logistic regression 
model = LogisticRegression(max_iter=1000)


In [30]:
model.fit(X_train, Y_train)

Model Evaluation

accuracy score

In [32]:
from sklearn.metrics import accuracy_score


# accuracy_score on the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [33]:
print('Accuracy score of the training data:', training_data_accuracy)

Accuracy score of the training data: 0.8283556900151267


In [34]:
# accuracy_score prediction on the test data using logistics regression
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)


# Accuracy score
print('Accuracy score of the testdata:', test_data_accuracy)

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Precision, Recall, and F1-Score
precision = precision_score(Y_test, X_test_prediction)
recall = recall_score(Y_test, X_test_prediction)
f1 = f1_score(Y_test, X_test_prediction)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')


Accuracy score of the testdata: 0.7544754475447545
Precision: 0.7582133031234828
Recall: 0.8297910024796316
F1-Score: 0.7923890063424948


 ## Model Performance Analysis
 
Accuracy (75.45%)
The model's overall correctness is 75.45%, meaning about three-quarters of the predictions are correct. This is a decent result for a sentiment analysis task.

Precision (75.82%)
Of all the instances predicted as positive, 75.82% are actually positive. This indicates the model is reasonably good at avoiding false positives (predicting positive sentiment when it’s actually negative).

Recall (82.98%)
With a recall of 82.98%, the model identifies the majority of positive sentiment cases correctly. This highlights the model's ability to capture most of the true positive instances.

F1-Score (79.24%)
The F1-Score, which balances Precision and Recall, is 79.24%. This reflects a strong trade-off between capturing true positives and avoiding false positives, indicating the model is reliable for practical use.

Summary
The model demonstrates solid performance across key metrics:

High Recall shows it effectively captures positive sentiment.
Good Precision indicates minimal false positives.
A strong F1-Score reflects a balanced trade-off between Precision and Recall.
While Accuracy is slightly lower than Recall, the model’s higher Recall ensures it captures most positive sentiment cases, making it suitable for tasks prioritizing sensitivity to positive sentiment detection.

## test/train accuracy

The gap between training accuracy (82.84%) and test accuracy (75.45%) is approximately 7.39%.
This suggests some level of overfitting, where the model fits the training data better than it generalizes to new data.

## Decision Tree Classifier

In [35]:
from sklearn.tree import DecisionTreeClassifier

# Initialize Decision Tree model
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, Y_train)

In [36]:
# Predictions and accuracy score for the training data
X_train_prediction_dt = dt_model.predict(X_train)
training_data_accuracy_dt = accuracy_score(Y_train, X_train_prediction_dt)
print('Accuracy score of the training data (Decision Tree):', training_data_accuracy_dt)

Accuracy score of the training data (Decision Tree): 0.9993249240539561


In [37]:
# Predictions and accuracy score for the test data
X_test_prediction_dt = dt_model.predict(X_test)
test_data_accuracy_dt = accuracy_score(Y_test, X_test_prediction_dt)
print('Accuracy score of the test data (Decision Tree):', test_data_accuracy_dt)

# Precision, Recall, and F1-Score for Decision Tree
precision_dt = precision_score(Y_test, X_test_prediction_dt)
recall_dt = recall_score(Y_test, X_test_prediction_dt)
f1_dt = f1_score(Y_test, X_test_prediction_dt)

print(f'Precision (Decision Tree): {precision_dt}')
print(f'Recall (Decision Tree): {recall_dt}')
print(f'F1-Score (Decision Tree): {f1_dt}')

Accuracy score of the test data (Decision Tree): 0.6796679667966796
Precision (Decision Tree): 0.7270024159078238
Recall (Decision Tree): 0.6928799149840595
F1-Score (Decision Tree): 0.7095311508116442


## Analysis

Training Accuracy (99.93%)
Extremely high accuracy on the training data suggests that the Decision Tree has overfit to the training set.Decision Trees are prone to overfitting, especially when they grow too deep and memorize the training data instead of learning generalized patterns.


Test Accuracy (67.97%)
A significantly lower test accuracy compared to the training accuracy reflects poor generalization to unseen data. This large discrepancy between training and test accuracy indicates the model struggles with new inputs.


Precision (72.70%)
Of all the instances predicted as positive, 72.70% are actually positive.
While the precision is acceptable, it is not exceptional, considering the gap in accuracy scores.


Recall (69.29%)
The model captures 69.29% of the true positive cases. The Recall is slightly lower than Precision, meaning the model is missing some positive cases (false negatives).


F1-Score (70.95%)
The F1-Score provides a balance between Precision and Recall.
This score indicates moderate performance but highlights that the model is far from optimal.


# Key Observations

Overfitting:

The Decision Tree overfits the training data, achieving near-perfect accuracy but failing to generalize to test data.
The test accuracy (67.97%) is far lower than other models you've tested, such as Logistic Regression or Random Forest, suggesting the Decision Tree is not the best choice here.
Generalization Issues:

The model struggles to perform consistently across training and test datasets, likely due to excessive depth or lack of pruning.


## Random Forest Classifier

In [38]:
from sklearn.ensemble import RandomForestClassifier

# Initialize Random Forest model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, Y_train)



In [39]:
# Predictions and accuracy score for the training data
X_train_prediction_rf = rf_model.predict(X_train)
training_data_accuracy_rf = accuracy_score(Y_train, X_train_prediction_rf)
print('Accuracy score of the training data (Random Forest):', training_data_accuracy_rf)

Accuracy score of the training data (Random Forest): 0.9993249240539561


In [40]:
# Predictions and accuracy score for the test data
X_test_prediction_rf = rf_model.predict(X_test)
test_data_accuracy_rf = accuracy_score(Y_test, X_test_prediction_rf)
print('Accuracy score of the test data (Random Forest):', test_data_accuracy_rf)

# Precision, Recall, and F1-Score for Random Forest
precision_rf = precision_score(Y_test, X_test_prediction_rf)
recall_rf = recall_score(Y_test, X_test_prediction_rf)
f1_rf = f1_score(Y_test, X_test_prediction_rf)

print(f'Precision (Random Forest): {precision_rf}')
print(f'Recall (Random Forest): {recall_rf}')
print(f'F1-Score (Random Forest): {f1_rf}')

Accuracy score of the test data (Random Forest): 0.7350735073507351
Precision (Random Forest): 0.7569444444444444
Recall (Random Forest): 0.7818809776833157
F1-Score (Random Forest): 0.7692106638787245


# Analysis

Training Accuracy: 0.9993 — This suggests that the model fits the training data almost perfectly, which might be a sign of overfitting if the test performance is lower.

Test Accuracy: 0.7351 — A substantial drop from training accuracy, suggesting that the model is not generalizing well to unseen data.

Precision, Recall, and F1-Score

Precision: 0.7569 — This means that when the model predicts a positive class, it's correct 75.69% of the time.

Recall: 0.7819 — This shows that the model identifies 78.19% of the actual positive class instances.

F1-Score: 0.7692 — This is the harmonic mean of precision and recall, providing a balanced measure of performance.

## Support Vector Machine (SVM)

In [41]:
from sklearn.svm import SVC

# Initialize support vector machine model
svm_model = SVC()
svm_model.fit(X_train, Y_train)

In [42]:
# Predictions and accuracy score for the training data
X_train_prediction_svm = svm_model.predict(X_train)
training_data_accuracy_svm = accuracy_score(Y_train, X_train_prediction_svm)
print('Accuracy score of the training data (SVM):', training_data_accuracy_svm)


Accuracy score of the training data (SVM): 0.9593079221412409


In [43]:
# Predictions and accuracy score for the test data
X_test_prediction_svm = svm_model.predict(X_test)
test_data_accuracy_svm = accuracy_score(Y_test, X_test_prediction_svm)
print('Accuracy score of the test data (SVM):', test_data_accuracy_svm)

# Precision, Recall, and F1-Score for SVM
precision_svm = precision_score(Y_test, X_test_prediction_svm)
recall_svm = recall_score(Y_test, X_test_prediction_svm)
f1_svm = f1_score(Y_test, X_test_prediction_svm)

print(f'Precision (SVM): {precision_svm}')
print(f'Recall (SVM): {recall_svm}')
print(f'F1-Score (SVM): {f1_svm}')

Accuracy score of the test data (SVM): 0.7565756575657566
Precision (SVM): 0.7605028386050284
Recall (SVM): 0.8304109103790294
F1-Score (SVM): 0.7939209211751757


# Metrics Overview
Training Accuracy (SVM): 0.9593 — This indicates a good fit on the training data, with less overfitting compared to the Random Forest model.

Test Accuracy (SVM): 0.7566 — A modest improvement over the Random Forest's test accuracy of 0.7351, showing better generalization.

Precision, Recall, and F1-Score
Precision: 0.7605 — When the model predicts the positive class, it is correct 76.05% of the time. Slightly better than the Random Forest's 75.69%.

Recall: 0.8304 — The model captures 83.04% of all actual positive instances, significantly better than the Random Forest's 78.19%.

F1-Score: 0.7939 — A higher harmonic mean of precision and recall compared to Random Forest, indicating a better balance between the two.

Analysis

The SVM model's performance suggests that it is better at capturing the patterns in the data while maintaining a reasonable tradeoff between precision and recall. However, the test accuracy (75.66%) indicates there is still room for improvement.

# K-Nearest Neighbors (KNN)

In [44]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize support vector machine model
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, Y_train)

In [45]:
# Predictions and accuracy score for the training data
X_train_prediction_knn = knn_model.predict(X_train)
training_data_accuracy_knn = accuracy_score(Y_train, X_train_prediction_knn)
print('Accuracy score of the training data (KNN):', training_data_accuracy_knn)


Accuracy score of the training data (KNN): 0.6573989573827056


In [46]:
# Predictions and accuracy score for the test data
X_test_prediction_knn = knn_model.predict(X_test)
test_data_accuracy_knn = accuracy_score(Y_test, X_test_prediction_knn)
print('Accuracy score of the test data (KNN):', test_data_accuracy_knn)

# Precision, Recall, and F1-Score for KNN
precision_knn = precision_score(Y_test, X_test_prediction_knn)
recall_knn = recall_score(Y_test, X_test_prediction_knn)
f1_knn = f1_score(Y_test, X_test_prediction_knn)

print(f'Precision (KNN): {precision_knn}')
print(f'Recall (KNN): {recall_knn}')
print(f'F1-Score (KNN): {f1_knn}')

Accuracy score of the test data (KNN): 0.5065506550655066
Precision (KNN): 0.6325884543761638
Recall (KNN): 0.3008324477506199
F1-Score (KNN): 0.407754171167927


# Metrics Overview

Training Accuracy (KNN): 0.6574 — The model doesn't perform very well even on the training data, suggesting that it struggles to fit the data adequately.

Test Accuracy (KNN): 0.5066 — This is only slightly better than random guessing for a binary classification task (baseline ~50%).

Precision, Recall, and F1-Score

Precision: 0.6326 — When the model predicts the positive class, it is correct 63.26% of the time, but this is not balanced with recall.

Recall: 0.3008 — The model identifies only 30.08% of the actual positive cases, which is quite low.

F1-Score: 0.4078 — A low harmonic mean of precision and recall reflects an imbalanced performance.

Analysis

The low accuracy, recall, and F1-score suggest that the KNN model is not performing well on your dataset. 

The following factors might contribute to this:

High Dimensionality: KNN struggles in high-dimensional spaces due to the "curse of dimensionality." Distances become less meaningful as the number of features increases.
Inappropriate Choice of k: The value of k (number of neighbors) may not be optimal for your dataset.
Data Scaling: KNN relies heavily on distance metrics, so if your features aren't scaled, the model might not perform well.


# Gradient Boosting

In [48]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize the Gradient Boosting model
gb_model = GradientBoostingClassifier()

# Train the model on training data
gb_model.fit(X_train, Y_train)



In [49]:
# Make predictions on the training data
X_train_prediction = gb_model.predict(X_train)
print('Gradient Boosting - Training Accuracy:', accuracy_score(Y_train, X_train_prediction))



Gradient Boosting - Training Accuracy: 0.680339038141791


In [50]:
# Make predictions on the test data
X_test_prediction = gb_model.predict(X_test)
print('Gradient Boosting - Test Accuracy:', accuracy_score(Y_test, X_test_prediction))

# Calculate precision, recall, and F1-Score
print('Gradient Boosting - Precision:', precision_score(Y_test, X_test_prediction))
print('Gradient Boosting - Recall:', recall_score(Y_test, X_test_prediction))
print('Gradient Boosting - F1-Score:', f1_score(Y_test, X_test_prediction))


Gradient Boosting - Test Accuracy: 0.6758675867586759
Gradient Boosting - Precision: 0.6471127966723759
Gradient Boosting - Recall: 0.9368579525327666
Gradient Boosting - F1-Score: 0.7654848046309696


# Metrics Overview

Training Accuracy: 0.6803
Indicates that the model has learned moderately well from the training data.

Test Accuracy: 0.6759
The test accuracy is very close to the training accuracy, suggesting the model generalizes well without overfitting.

Precision, Recall, and F1-Score
Precision: 0.6471
Indicates that 64.71% of the positive predictions were correct.

Recall: 0.9369
The recall is excellent, meaning the model captures 93.69% of all actual positive instances. This makes it particularly useful for scenarios where missing a positive instance is costly.

F1-Score: 0.7655
A good balance between precision and recall, indicating strong overall performance.

Analysis

The Gradient Boosting model performs better than the KNN model and has strengths that make it competitive with the Random Forest and SVM models:

High Recall: Suggests it is particularly good at identifying positive instances, which might be critical for your use case.

Balanced Generalization: The closeness of training and test accuracy indicates that the model is neither overfitting nor underfitting.


# XGBoost

In [58]:
from xgboost import XGBClassifier

# Initialize the XGBoost model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Train the model on training data
xgb_model.fit(X_train, Y_train)



Parameters: { "use_label_encoder" } are not used.



AttributeError: 'super' object has no attribute '__sklearn_tags__'

AttributeError: 'super' object has no attribute '__sklearn_tags__'

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=None,
              n_jobs=None, num_parallel_tree=None, random_state=None, ...)

In [59]:
# Make predictions on the training data
X_train_prediction = xgb_model.predict(X_train)
print('XGBoost - Training Accuracy:', accuracy_score(Y_train, X_train_prediction))



XGBoost - Training Accuracy: 0.7478341313397757


In [60]:
# Make predictions on the test data
X_test_prediction = xgb_model.predict(X_test)
print('XGBoost - Test Accuracy:', accuracy_score(Y_test, X_test_prediction))

# Calculate precision, recall, and F1-Score
print('XGBoost - Precision:', precision_score(Y_test, X_test_prediction))
print('XGBoost - Recall:', recall_score(Y_test, X_test_prediction))
print('XGBoost - F1-Score:', f1_score(Y_test, X_test_prediction))


XGBoost - Test Accuracy: 0.7204220422042205
XGBoost - Precision: 0.7007535741953659
XGBoost - Recall: 0.8811547998583068
XGBoost - F1-Score: 0.7806676866344985


# Metrics Overview

Training Accuracy: 0.7478
The model fits the training data well, with a relatively high accuracy.

Test Accuracy: 0.7204
The test accuracy is slightly lower than the training accuracy, indicating that the model generalizes well to new data without significant overfitting.

Precision, Recall, and F1-Score
Precision: 0.7008
The model is correct 70.08% of the time when predicting the positive class. This is quite good and suggests that it doesn't make too many false positives.

Recall: 0.8812
The recall is excellent, capturing 88.12% of the actual positive instances. This is a strong point for XGBoost, especially in cases where catching all positives is important.

F1-Score: 0.7807
A good balance between precision and recall, making the model effective in both minimizing false positives and false negatives.

Analysis

Good Balance Between Precision and Recall: XGBoost shows a higher recall than precision, indicating it’s better at identifying the positive cases, which may be crucial depending on your specific use case (e.g., identifying fraud, detecting diseases).

Generalization: The slight difference between the training and test accuracies suggests that XGBoost is performing well without overfitting the data.

Strengths of XGBoost:

Strong Recall: The high recall value indicates that the model is very effective at detecting the positive class.

Overall Performance: The F1-Score shows that XGBoost strikes a good balance between precision and recall, making it a robust model for various applications.


In [61]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Function to evaluate models and return a dictionary of metrics
def evaluate_model(model, X_train, Y_train, X_test, Y_test):
    # Make predictions on the training data
    X_train_prediction = model.predict(X_train)
    # Make predictions on the test data
    X_test_prediction = model.predict(X_test)
    
    # Calculate the metrics for training and test data
    metrics = {
        'Model': model.__class__.__name__,
        'Training Accuracy': accuracy_score(Y_train, X_train_prediction),
        'Test Accuracy': accuracy_score(Y_test, X_test_prediction),
        'Precision': precision_score(Y_test, X_test_prediction),
        'Recall': recall_score(Y_test, X_test_prediction),
        'F1-Score': f1_score(Y_test, X_test_prediction)
    }
    
    return metrics

# List of models to evaluate (you can add more models here)
models = [
    dt_model,  # Decision Tree
    rf_model,  # Random Forest
    model,  # Logistic Regression
    svm_model, # SVM
    knn_model, # KNN
    
    gb_model,  # Gradient Boosting
    xgb_model  # XGBoost
]

# List to store the metrics for each model
results = []

# Evaluate each model and store the results
for model in models:
    results.append(evaluate_model(model, X_train, Y_train, X_test, Y_test))

# Convert the results into a DataFrame for better visualization
metrics_df = pd.DataFrame(results)

# Display the metrics in a tabular form
print(metrics_df)


                        Model  Training Accuracy  Test Accuracy  Precision  \
0      DecisionTreeClassifier           0.999325       0.679668   0.727002   
1      RandomForestClassifier           0.999325       0.735074   0.756944   
2          LogisticRegression           0.828356       0.754475   0.758213   
3                         SVC           0.959308       0.756576   0.760503   
4        KNeighborsClassifier           0.657399       0.506551   0.632588   
5  GradientBoostingClassifier           0.680339       0.675868   0.647113   
6               XGBClassifier           0.747834       0.720422   0.700754   

     Recall  F1-Score  
0  0.692880  0.709531  
1  0.781881  0.769211  
2  0.829791  0.792389  
3  0.830411  0.793921  
4  0.300832  0.407754  
5  0.936858  0.765485  
6  0.881155  0.780668  
