# COURSE:   PGP [AI&ML]

## Learner :  Chaitanya Kumar Battula
## Module  : NLP
## Topic   : Twitter Hate

In [1]:
import pandas as pd, numpy as np
import os, re

#### Read in the csv using pandas 

In [2]:
df = pd.read_csv("TwitterHate.csv")
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
df.label.value_counts(normalize=True)

0    0.929854
1    0.070146
Name: label, dtype: float64

In [4]:
df.tweet.sample().values[0]

"#tbt when #drbanner gets  .. from #marvelstheavengers the #hulk follow @user he's awesome in every way "

#### Get the tweets into a list, for easy text clean up and manipulation

In [5]:
tweets = df.tweet.values

In [6]:
len(tweets)

31962

In [7]:
tweets[:5]

array([' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
       "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
       '  bihday your majesty',
       '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
       ' factsguide: society now    #motivation'], dtype=object)

The tweets contain - 
1. URLs
2. Hashtags
3. User handles
4. 'RT'

## Cleanup 

#### Normalizing case

In [8]:
tweets_lower = [twt.lower() for twt in tweets]

In [9]:
tweets_lower[:5]

[' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation']

#### Remove user handles, begin with '@'

In [10]:
import re

In [11]:
re.sub("@\w+","", "@Rahim this course rocks! http://rahimbaig.com/ai")

' this course rocks! http://rahimbaig.com/ai'

In [12]:
tweets_nouser = [re.sub("@\w+","", twt) for twt in tweets_lower]

In [13]:
tweets_nouser[:5]

['  when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "  thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation']

#### Remove URLs

In [14]:
re.sub("\w+://\S+","", "@Rahim this course rocks! http://rahimbaig.com/ai")

'@Rahim this course rocks! '

In [15]:
tweets_nourl = [re.sub("\w+://\S+","", twt) for twt in tweets_nouser]

In [16]:
tweets_nourl[:5]

['  when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "  thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation']

#### Tokenze using Tweet Tokenizer from NLTK

In [17]:
from nltk.tokenize import TweetTokenizer

In [18]:
?TweetTokenizer()

Object `TweetTokenizer()` not found.


In [19]:
tkn = TweetTokenizer()

In [20]:
print(tkn.tokenize(tweets_nourl[0]))

['when', 'a', 'father', 'is', 'dysfunctional', 'and', 'is', 'so', 'selfish', 'he', 'drags', 'his', 'kids', 'into', 'his', 'dysfunction', '.', '#run']


In [21]:
tweet_token = [tkn.tokenize(sent) for sent in tweets_nourl]
print(tweet_token[0])

['when', 'a', 'father', 'is', 'dysfunctional', 'and', 'is', 'so', 'selfish', 'he', 'drags', 'his', 'kids', 'into', 'his', 'dysfunction', '.', '#run']


### Remove punctuations and stop words and other redundant terms tike 'rt', 'amp'
- Also remove hashtags

In [22]:
from nltk.corpus import stopwords
from string import punctuation

In [23]:
stop_nltk = stopwords.words("english")
stop_punct = list(punctuation)

In [24]:
stop_punct.extend(['...','``',"''",".."])

In [25]:
stop_context = ['rt', 'amp']

In [26]:
stop_final = stop_nltk + stop_punct + stop_context

#### Function to 
- remove stop words from a single tokenized sentence
- remove # tags
- remove terms with length = 1

In [27]:
def del_stop(sent):
    return [re.sub("#","",term) for term in sent if ((term not in stop_final) & (len(term)>1))]

In [28]:
del_stop(tweet_token[4])

['factsguide', 'society', 'motivation']

In [29]:
tweets_clean = [del_stop(tweet) for tweet in tweet_token]

#### Check out the top terms in the tweets

In [30]:
from collections import Counter

In [31]:
term_list = []
for tweet in tweets_clean:
    term_list.extend(tweet)

In [32]:
res = Counter(term_list)
res.most_common(10)

[('love', 2748),
 ('day', 2276),
 ('happy', 1684),
 ('time', 1131),
 ('life', 1118),
 ('like', 1047),
 ("i'm", 1018),
 ('today', 1013),
 ('new', 994),
 ('thankful', 946)]

## Data formatting for predictive modeling 

#### Join the tokens back into strings

In [33]:
tweets_clean[0]

['father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', 'run']

In [34]:
tweets_clean = [" ".join(tweet) for tweet in tweets_clean]

In [35]:
tweets_clean[0]

'father dysfunctional selfish drags kids dysfunction run'

### Separate X and Y and perform train test split, 70-30

In [36]:
len(tweets_clean)

31962

In [37]:
len(df.label)

31962

In [38]:
X = tweets_clean
y = df.label.values

####  Train test split

In [39]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)

### Create a document term matrix using count vectorizer

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
vectorizer = TfidfVectorizer(max_features = 5000)

In [42]:
len(X_train), len(X_test)

(22373, 9589)

In [43]:
X_train_bow = vectorizer.fit_transform(X_train)

X_test_bow = vectorizer.transform(X_test)

In [44]:
X_train_bow.shape, X_test_bow.shape

((22373, 5000), (9589, 5000))

### Model building

### Using a *simple* Logistic Regression

In [45]:
from sklearn.linear_model import LogisticRegression

In [46]:
logreg = LogisticRegression()

In [47]:
logreg.fit(X_train_bow, y_train)

LogisticRegression()

In [48]:
y_train_pred = logreg.predict(X_train_bow)
y_test_pred = logreg.predict(X_test_bow)

In [49]:
from sklearn.metrics import accuracy_score, classification_report

In [50]:
accuracy_score(y_train, y_train_pred)

0.9560184150538595

In [51]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     20815
           1       0.96      0.39      0.55      1558

    accuracy                           0.96     22373
   macro avg       0.96      0.69      0.76     22373
weighted avg       0.96      0.96      0.95     22373



#### Adjusting for class imbalance

In [52]:
logreg = LogisticRegression(class_weight="balanced")

In [53]:
logreg.fit(X_train_bow, y_train)

LogisticRegression(class_weight='balanced')

In [54]:
y_train_pred = logreg.predict(X_train_bow)
y_test_pred = logreg.predict(X_test_bow)

In [55]:
accuracy_score(y_train, y_train_pred)

0.9527108568363652

In [56]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97     20815
           1       0.60      0.97      0.74      1558

    accuracy                           0.95     22373
   macro avg       0.80      0.96      0.86     22373
weighted avg       0.97      0.95      0.96     22373



In [57]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [58]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'C': [0.01,0.1,1,10,100],
    'penalty': ["l1","l2"]
}

In [59]:
?LogisticRegression()

Object `LogisticRegression()` not found.


In [60]:
classifier_lr = LogisticRegression(class_weight="balanced")

In [61]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = classifier_lr, param_grid = param_grid, 
                          cv = StratifiedKFold(4), n_jobs = -1, verbose = 1, scoring = "recall" )

In [62]:
grid_search.fit(X_train_bow, y_train)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


        nan 0.73170852        nan 0.69640927]


GridSearchCV(cv=StratifiedKFold(n_splits=4, random_state=None, shuffle=False),
             estimator=LogisticRegression(class_weight='balanced'), n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']},
             scoring='recall', verbose=1)

In [63]:
grid_search.best_estimator_

LogisticRegression(C=1, class_weight='balanced')

### Using the best estimator to make predictions on the test set

In [64]:
y_test_pred = grid_search.best_estimator_.predict(X_test_bow)

In [65]:
y_train_pred = grid_search.best_estimator_.predict(X_train_bow)

In [66]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.98      0.94      0.96      8905
           1       0.49      0.77      0.60       684

    accuracy                           0.93      9589
   macro avg       0.73      0.85      0.78      9589
weighted avg       0.95      0.93      0.93      9589



# End Of  Practice Project