In [1]:
import pandas as pd
import numpy as np
import string  
import nltk 
from nltk.corpus import stopwords
stop_words = stopwords.words("english")
import re
from nltk.tokenize import TweetTokenizer
from itertools import chain
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the tweets file 
TwitterHate_df = pd.read_csv("TwitterHate.csv")

In [3]:
TwitterHate_df.head(8)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before the...
6,7,0,@user camping tomorrow @user @user @user @use...
7,8,0,the next school year is the year for exams.ð...


In [4]:
TwitterHate_df.shape

(31962, 3)

In [5]:
TwitterHate_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [6]:
# imbalance dataset
TwitterHate_df.label.value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [7]:
# zero null value
TwitterHate_df.isna().sum()

id       0
label    0
tweet    0
dtype: int64

In [8]:
# drop id column
TwitterHate_df.drop(['id'],inplace=True,axis=1)

In [9]:
TwitterHate_df.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [10]:
# Get the tweets into a list for easy text cleanup and manipulation
tweets = TwitterHate_df['tweet'].tolist()
type(tweets)

list

In [11]:
# Normalize the casing
tweets_preprocess = [twt.lower() for twt in tweets]

In [12]:
def preprocess_text(tweets):
     
    # Replace URLs
    tweets_preprocess = [re.sub(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)',"", twt) for twt in tweets]
    
    # Remove user handles. These begin with '@’. 
    tweets_preprocess = [re.sub("@\w+","", twt) for twt in tweets_preprocess]
    
    # Remove punctuation and ‘#’ symbols from the tweet while retaining the term.
    tweets_preprocess = [re.sub(r'[^\w\d\s]',"", twt) for twt in tweets_preprocess] 
    
    # Remove redundant terms like ‘amp’, ‘rt’
    tweets_preprocess = [re.sub(r'rt', ' ', twt)  for twt in tweets_preprocess] 
    tweets_preprocess = [re.sub(r'amp', ' ', twt)  for twt in tweets_preprocess] 
    tweets_preprocess = [re.sub(r'ð', ' ', twt)  for twt in tweets_preprocess] 
    
    # Remove numbers
    tweets_preprocess = [re.sub(r'\d+(\.\d+)?', ' ', twt)  for twt in tweets_preprocess] 
    
    #tokenize the tweets into individual terms
    tweet_token = [tkn.tokenize(sent) for sent in tweets_preprocess]
    
    return tweet_token

In [13]:
tkn = TweetTokenizer()
tweets = preprocess_text(tweets_preprocess)
type(tweets)

list

In [14]:
print(tweets[3])

['model', 'i', 'love', 'u', 'take', 'with', 'u', 'all', 'the', 'time', 'in', 'ur']


In [15]:
# Remove stop words.
# Remove terms with a length of 1.
def del_stop(sent):
    return [term for term in sent if ((term not in stop_words) & (len(term)>1))]
 
tweets_clean = [del_stop(tweet) for tweet in tweets] 

In [16]:
print(tweets_clean[3])

['model', 'love', 'take', 'time', 'ur']


In [17]:
# Get all the tokenized terms into one large list.
terms = list(chain.from_iterable(tweets_clean))
# Find the 10 most common terms using counter
terms = Counter(terms)
terms.most_common(10)

[('love', 2673),
 ('day', 2271),
 ('happy', 1672),
 ('im', 1143),
 ('like', 1139),
 ('time', 1123),
 ('life', 1098),
 ('today', 1001),
 ('new', 983),
 ('positive', 928)]

In [18]:
# Join the tokens back to form strings
tweets_clean = [' '.join(twe) for twe in tweets_clean] 
tweets_clean[0:2]

['father dysfunctional selfish drags kids dysfunction run',
 'thanks lyft credit cant use cause dont offer wheelchair vans pdx disapointed getthanked']

In [19]:
# Assign x and y
X = tweets_clean
Y = TwitterHate_df['label']

In [20]:
# Split into train and test set
X_train, X_test , Y_train , Y_test = train_test_split(X, Y, test_size = 0.30, random_state=42, stratify=Y)

In [21]:
# build a vocabulary of 5000
vectorizer = TfidfVectorizer(max_features = 5000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [22]:
X_train_vect.shape , X_test_vect.shape

((22373, 5000), (9589, 5000))

In [23]:
# Instantiate Logistic Regression and fit into the train data
model = LogisticRegression()
model.fit(X_train_vect,Y_train)

LogisticRegression()

In [24]:
# Make predictions for the train set
y_predict_train = model.predict(X_train_vect)

In [25]:
# Accuracy on the train set
accuracy_score(Y_train, y_predict_train)

0.9531578241630537

In [26]:
print(classification_report(Y_train, y_predict_train))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98     20804
           1       0.95      0.35      0.51      1569

    accuracy                           0.95     22373
   macro avg       0.95      0.68      0.74     22373
weighted avg       0.95      0.95      0.94     22373



In [27]:
# Adjust class_weight in the LogisticRegression model
model = LogisticRegression(class_weight='balanced')

In [28]:
# Train again with the adjustment
model.fit(X_train_vect,Y_train)
y_predict_train = model.predict(X_train_vect)

In [29]:
print(classification_report(Y_train, y_predict_train))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97     20804
           1       0.57      0.97      0.72      1569

    accuracy                           0.95     22373
   macro avg       0.79      0.96      0.85     22373
weighted avg       0.97      0.95      0.95     22373



In [30]:
# Provide the parameter for ‘C’ and ‘penalty’
grid_para ={
    'C':[0.1, 1, 10],
    'penalty': ["l1","l2"]
}

In [31]:
model = LogisticRegression(class_weight='balanced')

In [32]:
# Choose ‘recall’ as the metric for scoring and 4 fold cross validation.
GridSearch = GridSearchCV(estimator=model,param_grid=grid_para, cv=StratifiedKFold(4),scoring='recall')
GridSearch.fit(X_train_vect,Y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=4, random_state=None, shuffle=False),
             estimator=LogisticRegression(class_weight='balanced'),
             param_grid={'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']},
             scoring='recall')

In [33]:
# Check the best parameters
GridSearch.best_estimator_

LogisticRegression(C=0.1, class_weight='balanced')

In [34]:
# Predict and evaluate using the best estimator
y_predict_test = GridSearch.best_estimator_.predict(X_test_vect)

In [35]:
print(classification_report(Y_test, y_predict_test))

              precision    recall  f1-score   support

           0       0.98      0.92      0.95      8916
           1       0.44      0.77      0.56       673

    accuracy                           0.91      9589
   macro avg       0.71      0.85      0.76      9589
weighted avg       0.94      0.91      0.92      9589

