In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

print("all libraries installed successfully")

all libraries installed successfully


In [2]:
# load dataset
df = pd.read_csv(r"tweets.csv", encoding='latin-1')
display(df.head())
print(df.info())
print(df.describe())

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599999 entries, 0 to 1599998
Data columns (total 6 columns):
 #   Column                                                                                                               Non-Null Count    Dtype 
---  ------                                                                                                               --------------    ----- 
 0   0                                                                                                                    1599999 non-null  int64 
 1   1467810369                                                                                                           1599999 non-null  int64 
 2   Mon Apr 06 22:19:45 PDT 2009                                                                                         1599999 non-null  object
 3   NO_QUERY                                                                                                             1599999 non-null  object
 4   _

In [3]:
# since no columns are given so we are giving the names of the columns
cols = ['target', 'id', 'date', 'flag', 'user', 'text']
df = pd.read_csv(r'tweets.csv', encoding='latin-1', names=cols)



# now keeping onluy the features and the target variable
df = df[['target','text']]

# checking for null values
print(df.isnull().sum())

target    0
text      0
dtype: int64


In [4]:
# PREPROCESS
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [stemmer.stem(lemmatizer.lemmatize(t)) for t in tokens]
    return " ".join(tokens)

df['clean_text'] = df['text'].apply(preprocess_text)
    

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\azama\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\azama\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\azama\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# train test split
X = df['clean_text']
y = df['target']

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# feature extraction (bag of words)

vectorizer = CountVectorizer()
XtrainVec = vectorizer.fit_transform(Xtrain)
XtestVec = vectorizer.transform(Xtest)

In [7]:
# Naive Bayes Classifier
model = MultinomialNB()
model.fit(XtrainVec, ytrain)

yPred = model.predict(XtestVec)

print(classification_report(ytest, yPred))
print(confusion_matrix(ytest, yPred))

              precision    recall  f1-score   support

           0       0.75      0.80      0.78    159494
           4       0.79      0.73      0.76    160506

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000

[[127938  31556]
 [ 42570 117936]]


In [31]:
# new sample testing 
sample = 'I can understand this project, it is good as well as bad'
sampleVec = vectorizer.transform([sample])
print("The prediction is: ", model.predict(sampleVec))
print("the accuracy is: ", model.score(sampleVec, [4]))
print("The probabilities are: ", model.predict_proba(sampleVec))

The prediction is:  [0]
the accuracy is:  0.0
The probabilities are:  [[0.85754556 0.14245444]]
