In [6]:
import pandas as pd
import re
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


In [2]:
# Loading the dataset
df = pd.read_csv(r"C:\Users\muham\Downloads\archive (3)\tweets.csv")
print(df.head())

      author                                            content country  \
0  katyperry  Is history repeating itself...?#DONTNORMALIZEH...     NaN   
1  katyperry  @barackobama Thank you for your incredible gra...     NaN   
2  katyperry                Life goals. https://t.co/XIn1qKMKQl     NaN   
3  katyperry            Me right now üôèüèª https://t.co/gW55C1wrwd     NaN   
4  katyperry  SISTERS ARE DOIN' IT FOR THEMSELVES! üôåüèªüí™üèª‚ù§Ô∏è ht...     NaN   

          date_time            id language  latitude  longitude  \
0  12/01/2017 19:52  8.196330e+17       en       NaN        NaN   
1  11/01/2017 08:38  8.191010e+17       en       NaN        NaN   
2  11/01/2017 02:52  8.190140e+17       en       NaN        NaN   
3  11/01/2017 02:44  8.190120e+17       en       NaN        NaN   
4  10/01/2017 05:22  8.186890e+17       en       NaN        NaN   

   number_of_likes  number_of_shares  
0             7900              3472  
1             3689              1380  
2      

Cleaning the mails

In [3]:
def clean_content(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]","",text)
    return text
df["clean_content"] = df["content"].apply(clean_content)
print(df[["content","clean_content"]].head())

                                             content  \
0  Is history repeating itself...?#DONTNORMALIZEH...   
1  @barackobama Thank you for your incredible gra...   
2                Life goals. https://t.co/XIn1qKMKQl   
3            Me right now üôèüèª https://t.co/gW55C1wrwd   
4  SISTERS ARE DOIN' IT FOR THEMSELVES! üôåüèªüí™üèª‚ù§Ô∏è ht...   

                                       clean_content  
0  is history repeating itselfdontnormalizehate h...  
1  barackobama thank you for your incredible grac...  
2                       life goals httpstcoxinqkmkql  
3                      me right now  httpstcogwcwrwd  
4  sisters are doin it for themselves  httpstcosh...  


In [7]:
!pip install vaderSentiment

Collecting vaderSentiment
  Using cached vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Using cached vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [12]:
# Using a pre-trained model and assigning the values to it

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

df["sentiment_score"] = df["clean_content"].apply(
    lambda x: analyzer.polarity_scores(x)["compound"]
)

def classify(score):
    if score >= 0.05:
        return "Positive"
    elif score <= -0.05:
        return "Negative"
    else:
        return "Neutral"
df["sentiment"] = df["sentiment_score"].apply(classify)

df["sentiment_label"] = df["sentiment"].map({"Positive":2, "Neutral":1, "Negative":0})
print(df[["sentiment", "sentiment_label"]].head())

  sentiment  sentiment_label
0   Neutral                1
1  Positive                2
2   Neutral                1
3   Neutral                1
4   Neutral                1


In [9]:
# Splitting data for cleanong and testing 

X = df["clean_content"]
y = df["sentiment_label"]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25,random_state=42)
print("training sample: ",len(X_train))
print("Test sample: ",len(X_test))

training sample:  39406
Test sample:  13136


In [10]:
# Converting text into numbers
vectorizer = TfidfVectorizer(stop_words="english",max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [11]:
# Training a model and evaluating it's performance

model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec,y_train)

y_pred = model.predict(X_test_vec)
print("Accuracy: ",accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy:  0.8861906211936662
              precision    recall  f1-score   support

           0       0.84      0.27      0.41       456
           1       0.88      0.97      0.92      8702
           2       0.91      0.78      0.84      3978

    accuracy                           0.89     13136
   macro avg       0.88      0.67      0.72     13136
weighted avg       0.89      0.89      0.88     13136



In [14]:
#Testing the model
sample_mail = [
    "Hey, I have a good and excellent and fantastic news for you",
    "Hello, there is a bad news",
    "Hi, have you done your work yet?"
]

sample_vec = vectorizer.transform(sample_mail)
sample_pred = model.predict(sample_vec)

label_map = {2: "Positive", 1: "Neutral", 0: "Negative"}

for mail, pred in zip(sample_mail, sample_pred):
    print("Mail:", mail)
    print("Prediction:", label_map[pred])
    print("----------------------------------------------------")



Mail: Hey, I have a good and excellent and fantastic news for you
Prediction: Positive
----------------------------------------------------
Mail: Hello, there is a bad news
Prediction: Negative
----------------------------------------------------
Mail: Hi, have you done your work yet?
Prediction: Neutral
----------------------------------------------------
