# Spacy

In [1]:
import pandas as pd
import spacy

In [4]:
nlp = spacy.load("en_core_web_sm")

In [None]:
def text_preprocessor(txt):
    txt = nlp(txt)
    txt = " ".join([i.lemma_ for i in txt if not i.is_stop and not i.is_punct])
    return txt

In [None]:
text = "I love programming in Python! It's amazing, and I learn new things every day."
print(text_preprocessor(text))

love program Python amazing learn new thing day


# Tweets

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import classification_report

from imblearn.over_sampling import RandomOverSampler

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [None]:
df = pd.read_csv(r"DataSets\Tweets.csv")
df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27481 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB


In [None]:
df.dropna(inplace=True)

In [None]:
df_preprocessed = df.drop(columns=["textID"])
df_preprocessed["preprocessed"] = df_preprocessed["selected_text"].apply(func=text_preprocessor)
df_preprocessed

Unnamed: 0,text,selected_text,sentiment,preprocessed
0,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,I`d respond go
1,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,Sooo SAD
2,my boss is bullying me...,bullying me,negative,bully
3,what interview! leave me alone,leave me alone,negative,leave
4,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,son
...,...,...,...,...
27476,wish we could come see u on Denver husband l...,d lost,negative,d lose
27477,I`ve wondered about rake to. The client has ...,", don`t force",negative,don`t force
27478,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,yay good
27479,But it was worth it ****.,But it was worth it ****.,positive,worth


In [None]:
cv = CountVectorizer()
x_vector = cv.fit_transform(df_preprocessed["preprocessed"])

In [None]:
df_preprocessed["sentiment"] = df_preprocessed["sentiment"].replace(["neutral","positive","negative"],[0,1,2])
df_preprocessed

  df_preprocessed["sentiment"] = df_preprocessed["sentiment"].replace(["neutral","positive","negative"],[0,1,2])


Unnamed: 0,text,selected_text,sentiment,preprocessed
0,"I`d have responded, if I were going","I`d have responded, if I were going",0,I`d respond go
1,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,2,Sooo SAD
2,my boss is bullying me...,bullying me,2,bully
3,what interview! leave me alone,leave me alone,2,leave
4,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",2,son
...,...,...,...,...
27476,wish we could come see u on Denver husband l...,d lost,2,d lose
27477,I`ve wondered about rake to. The client has ...,", don`t force",2,don`t force
27478,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,1,yay good
27479,But it was worth it ****.,But it was worth it ****.,1,worth


In [None]:
y = df_preprocessed["sentiment"]
y.value_counts()

sentiment
0    11117
1     8582
2     7781
Name: count, dtype: int64

In [None]:
ros = RandomOverSampler()
x_res,y_res = ros.fit_resample(x_vector,y)
y_res.value_counts()

sentiment
0    11117
2    11117
1    11117
Name: count, dtype: int64

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x_vector,y,test_size=0.3,random_state=3)

In [None]:
models = [MultinomialNB(),SVC()]

for i in models:
    print(i)
    i.fit(x_train,y_train)
    y_pred = i.predict(x_test)
    print(classification_report(y_test,y_pred))
    print(cross_val_score(i,x_train,y_train))
    print(cross_val_score(i,x_train,y_train).mean())
    print()

MultinomialNB()
              precision    recall  f1-score   support

           0       0.71      0.82      0.76      3313
           1       0.79      0.80      0.79      2572
           2       0.82      0.62      0.70      2359

    accuracy                           0.76      8244
   macro avg       0.77      0.75      0.75      8244
weighted avg       0.76      0.76      0.75      8244

[0.75025988 0.75071484 0.75903301 0.76241227 0.74967507]
0.7544190143033397

SVC()
              precision    recall  f1-score   support

           0       0.80      0.86      0.83      3313
           1       0.90      0.77      0.83      2572
           2       0.75      0.78      0.77      2359

    accuracy                           0.81      8244
   macro avg       0.82      0.80      0.81      8244
weighted avg       0.82      0.81      0.81      8244

[0.80769231 0.81102158 0.81232129 0.79906421 0.80478295]
0.8069764651776609



In [None]:
svc = SVC()
svc.fit(x_train,y_train)
y_pred = svc.predict(x_test)
print(classification_report(y_test,y_pred))
print(cross_val_score(svc,x_train,y_train))
print(cross_val_score(svc,x_train,y_train).mean())

              precision    recall  f1-score   support

           0       0.80      0.86      0.83      3313
           1       0.90      0.77      0.83      2572
           2       0.75      0.78      0.77      2359

    accuracy                           0.81      8244
   macro avg       0.82      0.80      0.81      8244
weighted avg       0.82      0.81      0.81      8244

[0.80769231 0.81102158 0.81232129 0.79906421 0.80478295]
0.8069764651776609


In [None]:
def sentiment_predict(txt):
    txt = text_preprocessor(txt)
    vector = cv.transform([txt])
    pred = svc.predict(vector)
    if pred == 0:
        return "neutral"
    elif pred == 1:
        return "positive"
    else:
        return "negative"

In [None]:
text = "I need to check my email later."
print(sentiment_predict(text))

neutral


In [None]:
text = "I had an amazing time at the concert last night!"
print(sentiment_predict(text))

positive


In [None]:
text = "I am feeling really sick today."
print(sentiment_predict(text))

negative
