# IMPLEMENTATION

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pylab as pl  
import plotly.express as px

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
from other import tools

In [4]:
df_rating=pd.read_csv('https://raw.githubusercontent.com/GDLPLearning/Sentiment-Analysis-for-the-MDP/master/notebooks/Exploratory/data/sentiment.csv')

In [5]:
df_rating.head()

Unnamed: 0,full_text,sentiment
0,@camipaisa1 @Marbelle30 Soy antioqueño de nac...,0
1,@AlejaLoC Medellin lastimosamente a vivido en ...,0
2,Señora acomplejada nosotros los paisas no la ...,0
3,@cocoa_nini @diamaov La cultura traqueta de M...,0
4,@PinoCalad Los efectos de la cultura traqueta ...,0


In [6]:
# clean the tweet and save it on a new column

def clean_tweet(tweet):

  new_tweet = ''
  words = tweet.split()
  for w in words:
    wl = w.lower() # word in lower case
    
    # replace numbers
    for number, new_value in tools.replace_numbers_dict.items():
      wl = str(wl).replace(number,new_value)

    # replace some punctuations mark to keep only words
    for punctuation, new_value in tools.replace_punctuation_dict.items():
      wl = wl.replace(punctuation,new_value)
    
    # replace accent mark 
    for accent, new_value in tools.replace_accent_dict.items():
      wl = wl.replace(accent,new_value)

    if wl not in tools.model_stop_wprds:
      new_tweet += f'{wl} '
  
  return new_tweet[:-1]

In [7]:
df_rating['tweet'] = df_rating['full_text'].apply(clean_tweet)

In [8]:
df_rating.head(10)

Unnamed: 0,full_text,sentiment,tweet
0,@camipaisa1 @Marbelle30 Soy antioqueño de nac...,0,camipaisa marbelle antioqueño nacimiento verda...
1,@AlejaLoC Medellin lastimosamente a vivido en ...,0,alejaloc medellin lastimosamente vivido cultur...
2,Señora acomplejada nosotros los paisas no la ...,0,señora acomplejada nosotros paisas queremos me...
3,@cocoa_nini @diamaov La cultura traqueta de M...,0,cocoanini diamaov cultura traqueta medellin se...
4,@PinoCalad Los efectos de la cultura traqueta ...,0,pinocalad efectos cultura traqueta dejo escoba...
5,@karinin7986 Soy de Medellín y la cultura pais...,0,karinin medellin cultura paisa asco verguenza
6,Para nadie es un secreto que la sociedad Pais...,0,nadie secreto sociedad paisa medellin valle ab...
7,Esa cultura traqueta que carcome a medellín y ...,0,cultura traqueta carcome medellin aun sigue vi...
8,"cansadita de la gente de Medellín, y la cultu...",0,cansadita gente medellin cultura paisa
9,@nocontextSebs En Medellín no están lejos de p...,0,nocontextsebs medellin lejos pensar superiorid...


In [9]:
# make a copy of the data
df_ml = df_rating.copy()

In [10]:
# split the data into train and test data
tweets_train, tweets_test = train_test_split(df_rating, test_size=0.2, random_state=0)

In [11]:
# creating the instance of CountVectorizer() to use it in the dataset
vectorizer = CountVectorizer()

In [12]:
x_train_bow = vectorizer.fit_transform(tweets_train['tweet'])
x_test_bow = vectorizer.transform(tweets_test['tweet'])

In [13]:
y_train_bow = tweets_train['sentiment']
y_test_bow = tweets_test['sentiment']

In [14]:
# percentage of positive and negative reviews
y_test_bow.value_counts() / y_test_bow.shape[0]

0    0.611465
1    0.388535
Name: sentiment, dtype: float64

In [15]:
# Training the model 
lr_model_all = LogisticRegression(C=1, solver="liblinear")
lr_model_all.fit(x_train_bow, y_train_bow)
# Predicting the output
test_pred_lr_prob = lr_model_all.predict_proba(x_test_bow)
test_pred_lr_all = lr_model_all.predict(x_test_bow)

In [21]:
text=input("Enter a tweet: ")
text=clean_tweet(text)
text=vectorizer.transform([text])
text=text.toarray()
text=text.reshape(1,-1)
prediction=lr_model_all.predict(text)[0]
probability=lr_model_all.predict_proba(text)[0,1]
if prediction==0:
    print("The sentiment of the tweet is negative")
    print("The probability of the tweet being negative is",-100*(2*probability-1),"%")
else:
    print("The sentiment of the tweet is positive")
    print("The probability of the tweet being positive is",100*(2*probability-1),"%")

The sentiment of the tweet is negative
The probability of the tweet being negative is 6.030574322319115 %
