In [1]:
import pandas as pd 
import sklearn
import emoji
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [6]:
df = pd.read_csv('data/twitter_training.csv')
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [7]:
df.columns

Index(['2401', 'Borderlands', 'Positive',
       'im getting on borderlands and i will murder you all ,'],
      dtype='object')

In [None]:
df = df.rename(columns={'Positive': 'label', 'im getting on borderlands and i will murder you all ,': 'text'})

Unnamed: 0,2401,Borderlands,label,text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [13]:
df = df[['label','text']]
df


Unnamed: 0,label,text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...
...,...,...
74676,Positive,Just realized that the Windows partition of my...
74677,Positive,Just realized that my Mac window partition is ...
74678,Positive,Just realized the windows partition of my Mac ...
74679,Positive,Just realized between the windows partition of...


In [15]:
df.label.value_counts()

label
Negative      22542
Positive      20831
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

In [24]:
df.isna().sum()

label      0
text     686
dtype: int64

In [25]:
df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [32]:
min_size = df['label'].value_counts().min()

df_downsampled = df.groupby('label').apply(lambda x: x.sample(min_size, random_state=42)).reset_index(drop=True)
df_downsampled.label.value_counts()

  df_downsampled = df.groupby('label').apply(lambda x: x.sample(min_size, random_state=42)).reset_index(drop=True)


label
Irrelevant    12875
Negative      12875
Neutral       12875
Positive      12875
Name: count, dtype: int64

In [21]:
stopwords_set = set(stopwords.words('english'))
prter = PorterStemmer()

def extract_emojis(text):
    return [char for char in text if emoji.is_emoji(char)]

def preprocessing(text):
    emojis = extract_emojis(text)
    
    text_without_emojis = ''.join([char for char in text if not emoji.is_emoji(char)])

    text_without_emojis = re.sub('<[^>]*>', '', text_without_emojis) 
    text_without_emojis = re.sub('[\W+]', ' ', text_without_emojis.lower())  # Remove non-word characters


    text = [prter.stem(word) for word in text_without_emojis.split() if word not in stopwords_set]

    return " ".join(text), emojis


In [33]:
df_downsampled.text[0]


'It\'s good it\'s like a really good threesome on it, not like that other tweet that went if "u got upset about something a character does and u punched a hole in the wall that is fiction affecting reality"'

In [28]:
df_downsampled['text'] = df_downsampled['text'].apply(preprocessing)


In [30]:
df_downsampled.text[0]

('good like realli good threesom like tweet went u got upset someth charact u punch hole wall fiction affect realiti',
 [])

In [34]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(df_downsampled['text'],df_downsampled['label'],test_size=0.2,shuffle=True,stratify=None)


In [38]:
x_train.shape

(41200,)

In [37]:
y_test.shape

(10300,)

In [48]:
import re
import emoji
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.base import BaseEstimator, TransformerMixin

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.stopwords_set = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

    def extract_emojis(self, text):
        return [char for char in text if emoji.is_emoji(char)]

    def preprocess_text(self, text):
        emojis = self.extract_emojis(text)
        text_no_emoji = ''.join([char for char in text if not emoji.is_emoji(char)])
        text_no_emoji = re.sub('<[^>]*>', '', text_no_emoji)
        text_no_emoji = re.sub('[\W+]', ' ', text_no_emoji.lower())
        words = [self.stemmer.stem(word) for word in text_no_emoji.split() if word not in self.stopwords_set]
        return " ".join(words)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if isinstance(X, pd.Series):
            return X.apply(self.preprocess_text)
        else:
            return [self.preprocess_text(text) for text in X]


In [49]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

lr = Pipeline([
    ('cleaner', TextPreprocessor()),
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression())
])

lr.fit(x_train,y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [50]:
from sklearn.metrics import classification_report
y_pred = lr.predict(x_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

  Irrelevant       0.72      0.79      0.75      2538
    Negative       0.75      0.75      0.75      2522
     Neutral       0.77      0.70      0.73      2623
    Positive       0.73      0.73      0.73      2617

    accuracy                           0.74     10300
   macro avg       0.74      0.74      0.74     10300
weighted avg       0.74      0.74      0.74     10300



In [None]:
neg = 'FIX IT JESUS ! Please FIX IT ! What In the world is going on here.  @PlayStation @AskPlayStation @Playstationsup @Treyarch @CallofDuty negative 345 silver wolf error code pic.twitter.com/ziRyhrf59Q'
lr.predict([neg])

array(['Negative'], dtype=object)

In [53]:
irv = """Call of duty warzone (livestream) w/ subs #Warzone youtu.be/7BhH_pjOMU4 via @YouTube Please come watch this AMAZING Call of Duty Warzone stream from this AMAZING streamer! It'd be really, really nice to give him some views and likes as well! 😀 #COD #CallofDuty #Warzone"""
lr.predict([irv])

array(['Irrelevant'], dtype=object)

In [55]:
validation_df = pd.read_csv('data/twitter_validation.csv',header=None)
validation_df

Unnamed: 0,0,1,2,3
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...
...,...,...,...,...
995,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
996,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
997,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
998,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [59]:
validation_df = validation_df.rename(columns={2: 'label', 3: 'text'})
validation_df = validation_df[['text','label']]
validation_df

Unnamed: 0,text,label
0,I mentioned on Facebook that I was struggling ...,Irrelevant
1,BBC News - Amazon boss Jeff Bezos rejects clai...,Neutral
2,@Microsoft Why do I pay for WORD when it funct...,Negative
3,"CSGO matchmaking is so full of closet hacking,...",Negative
4,Now the President is slapping Americans in the...,Neutral
...,...,...
995,⭐️ Toronto is the arts and culture capital of ...,Irrelevant
996,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,Irrelevant
997,Today sucked so it’s time to drink wine n play...,Positive
998,Bought a fraction of Microsoft today. Small wins.,Positive


In [60]:
validation_df.isna().sum()

text     0
label    0
dtype: int64

In [None]:
y_pred = lr.predict(validation_df.text)
print(classification_report(validation_df.label, y_pred))

              precision    recall  f1-score   support

  Irrelevant       0.80      0.90      0.84       172
    Negative       0.84      0.83      0.84       266
     Neutral       0.86      0.82      0.84       285
    Positive       0.86      0.84      0.85       277

    accuracy                           0.84      1000
   macro avg       0.84      0.85      0.84      1000
weighted avg       0.84      0.84      0.84      1000

