In [19]:
import pandas as pd
import re
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
import joblib
import nltk


In [26]:
# Load the dataset
train_data = pd.read_csv('dataset/twitter_training.csv', names=['id','info','sentiment','text'])
val_date= pd.read_csv('dataset/twitter_validation.csv', names=['id','info','sentiment','text'])
train_data


Unnamed: 0,id,info,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [27]:
train_data.sentiment.value_counts()

sentiment
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

In [32]:
train_data[train_data['sentiment']=='Irrelevant'].head(-5)

Unnamed: 0,id,info,sentiment,text
102,2418,Borderlands,Irrelevant,Appreciate the (sonic) concepts / praxis Valen...
103,2418,Borderlands,Irrelevant,Appreciate the (sound) concepts / practices th...
104,2418,Borderlands,Irrelevant,Evaluate the (sound) concepts / concepts of Va...
105,2418,Borderlands,Irrelevant,Appreciate the (sonic) concepts / praxis Valen...
106,2418,Borderlands,Irrelevant,Appreciate by the ( sonic ) electronic concept...
...,...,...,...,...
73994,9077,Nvidia,Irrelevant,"Oh ah, I worked with these guys. The second co..."
73995,9077,Nvidia,Irrelevant,"Oh, yeah, I used to work with these guys. Thir..."
73996,9077,Nvidia,Irrelevant,"Oh, yeah, although I always used to work mostl..."
73997,9077,Nvidia,Irrelevant,"Oh, yeah, I used to work under these guys. Tot..."


In [6]:
train_data.isnull().sum()

id             0
info           0
sentiment      0
text         686
dtype: int64

In [7]:
# drop null values
train_data.dropna(inplace= True)


In [8]:
# Check for duplicates
train_data.duplicated().sum()

2340

In [9]:
train_data.drop_duplicates(inplace= True)

In [10]:
def preprocess_text(text):
    # Convert to string and lowercase
    text = str(text).lower()

    # Remove URLs
    text = re.sub('[^A-Za-z0-9 ]+', ' ', text)
    return text

In [11]:
# Preprocess the text
train_data['pre_text'] = train_data['text'].apply(preprocess_text)
val_date['pre_text'] = val_date['text'].apply(preprocess_text)

In [13]:
bow_counts= CountVectorizer(
    tokenizer= word_tokenize,
    ngram_range=(1, 4)
)

In [14]:
reviews_train, reviews_test = train_test_split(train_data, test_size=0.2, random_state=0)

In [15]:
X_train_bow = bow_counts.fit_transform(reviews_train['pre_text'])
X_test_bow = bow_counts.transform(reviews_test['pre_text'])
X_val_bow = bow_counts.transform(val_date['pre_text'])



In [25]:
train_data[train_data['sentiment']=='positive'].head()

Unnamed: 0,id,info,sentiment,text,pre_text


In [16]:
y_train_bow = reviews_train.sentiment
y_test_bow = reviews_test.sentiment
y_val_bow = val_date.sentiment

In [17]:
le= LabelEncoder()
y_test_bow= le.fit_transform(y_test_bow)
y_train_bow= le.transform(y_train_bow)
y_val_bow= le.transform(y_val_bow)

In [21]:
LR_clf= LogisticRegression(C=0.9, solver="liblinear",max_iter=1500)
LR_clf.fit(X_train_bow, y_train_bow)

In [22]:
y_pred = LR_clf.predict(X_test_bow)
print("LogisticRegression Accuracy on Test: ", accuracy_score(y_test_bow, y_pred) * 100)
print("\nLogisticRegression Classification Report: \n", classification_report(y_test_bow, y_pred, target_names=le.classes_))

y_pred_val = LR_clf.predict(X_val_bow)
print("LogisticRegression Accuracy on Validation: ", accuracy_score(y_val_bow, y_pred_val) * 100)
print("\nLogisticRegression Classification Report: \n", classification_report(y_val_bow, y_pred_val, target_names=le.classes_))


LogisticRegression Accuracy on Test:  91.39687412782584

LogisticRegression Classification Report: 
               precision    recall  f1-score   support

  Irrelevant       0.97      0.86      0.91      2535
    Negative       0.91      0.94      0.92      4306
     Neutral       0.94      0.90      0.92      3568
    Positive       0.87      0.94      0.90      3923

    accuracy                           0.91     14332
   macro avg       0.92      0.91      0.91     14332
weighted avg       0.92      0.91      0.91     14332

LogisticRegression Accuracy on Validation:  98.7

LogisticRegression Classification Report: 
               precision    recall  f1-score   support

  Irrelevant       0.99      0.98      0.99       172
    Negative       0.99      0.99      0.99       266
     Neutral       0.99      0.99      0.99       285
    Positive       0.98      0.98      0.98       277

    accuracy                           0.99      1000
   macro avg       0.99      0.99      0.99 

In [None]:
joblib.dump(LR_clf, 'model\\twitter_sentiment_model_LR.joblib')
joblib.dump(le, 'model\\label_encoder_LR.joblib')
joblib.dump(bow_counts, 'model\\bow.joblib')