<a href="https://colab.research.google.com/github/Coolinglass/Applied-Machine-Learning-Projects/blob/master/Text%20Model%20Prediction_Twitter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Table of Contents
### 1. Loading the Dataset
### 2. Pre-processing the Dataset
### 3. Feature Engineering and Model Building
> ##### a. Creating Meta Features
> ##### b. Counting Nouns and Verbs
> ##### c. Model Building for Meta Features
> ##### d. Tf-Idf Features
> ##### e. Model Building for Complete Feature Set

## 1. Loading the dataset

In [None]:
# Import libraries
import pandas as pd
import string

In [None]:
# Load dataset
data = pd.read_csv('train_E6oV3lV.csv')
data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [None]:
# distribution of class
data['label'].value_counts(normalize=True)

0    0.929854
1    0.070146
Name: label, dtype: float64

## 2. Pre-processing the dataset

In [None]:
# Sample document: lowercase
cleaned = data['tweet'][0].lower()

In [None]:
# Sample document
cleaned

' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run'

In [None]:
import re

In [None]:
# Strip @mentions and #hashtags from the sample document.
# Raw strings are used so that `\w` reaches the regex engine untouched instead
# of being parsed as an (invalid) Python string escape.
cleaned = re.sub(r'@\w+', '', cleaned)
cleaned = re.sub(r'#\w+', '', cleaned)

In [None]:
cleaned

'  when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   '

In [None]:
# Pre-initialised list of punctuations
punctuations = string.punctuation

In [None]:
# Pre-initialised list of punctuations
punctuations

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
# Sample document: delete every character that appears in the punctuation list
cleaned = cleaned.translate(str.maketrans("", "", punctuations))

In [None]:
# Sample document
cleaned

'  when a father is dysfunctional and is so selfish he drags his kids into his dysfunction   '

In [None]:
# Import spacy
from spacy.lang.en import English

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

In [None]:
# spacy document
my_doc = nlp(cleaned)

In [None]:
# Collect the text of every token in the spaCy document
token_list = [token.text for token in my_doc]

In [None]:
# Import stop words
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
# Keep only the tokens whose vocabulary entry is not flagged as a stop word
filtered_sentence = [word for word in token_list if not nlp.vocab[word].is_stop]

# Show the tokens before and after stop-word removal
print(token_list)
print(filtered_sentence)
cleaned = filtered_sentence

['  ', 'when', 'a', 'father', 'is', 'dysfunctional', 'and', 'is', 'so', 'selfish', 'he', 'drags', 'his', 'kids', 'into', 'his', 'dysfunction', '  ']
['  ', 'father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', '  ']


In [None]:
# Joining the tokenised words in sample document
cleaned = " ".join(cleaned)
cleaned

'   father dysfunctional selfish drags kids dysfunction   '

In [None]:
# Preprocessing function
def clean_text(text):
    """Lower-case a tweet and strip mentions, hashtags, punctuation and stop words.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces. Whitespace-only tokens
        emitted by the tokenizer are not stop words and are therefore kept, so
        the result may carry leading, trailing or repeated spaces.

    Notes
    -----
    Reads the notebook-level ``nlp`` object at call time, so the pipeline used
    is whichever one ``nlp`` is bound to when the function runs.
    """
    cleaned = text.lower()

    # Raw strings: `\w` must reach the regex engine rather than being parsed
    # as an (invalid) Python string escape.
    cleaned = re.sub(r'@\w+', '', cleaned)
    cleaned = re.sub(r'#\w+', '', cleaned)

    # Remove all punctuation characters in a single pass
    cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))

    # Tokenise, then drop every token whose vocabulary entry is a stop word
    kept_tokens = [
        token.text
        for token in nlp(cleaned)
        if not nlp.vocab[token.text].is_stop
    ]

    return " ".join(kept_tokens)

In [None]:
# Run the preprocessing function over every tweet
data["cleaned"] = data["tweet"].apply(clean_text)
data.head()

Unnamed: 0,id,label,tweet,cleaned
0,1,0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dys...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks credit nt use cause nt offer wheel...
2,3,0,bihday your majesty,bihday majesty
3,4,0,#model i love u take with u all the time in ...,love u u time urð± ðððð ð...
4,5,0,factsguide: society now #motivation,factsguide society


## 3. Feature Engineering and Model Building

### a. Creating Meta Features

> 1. Number of words in original text
> 2. Number of words in cleaned text
> 3. Number of characters including spaces in the cleaned text
> 4. Number of characters excluding spaces in the cleaned text
> 5. Number of digits in the cleaned text

In [None]:
# Meta features derived from the raw and the cleaned tweet text

# Whitespace-separated word count of the original tweet
data["word_count"] = data["tweet"].map(lambda tweet: len(tweet.split()))
# Whitespace-separated word count of the cleaned tweet
data["word_count_cleand"] = data["cleaned"].map(lambda doc: len(doc.split()))

# Length of the cleaned tweet, spaces included
data["char_count"] = data["cleaned"].map(len)
# Length of the cleaned tweet once every space character is removed
data["char_count_without_spaces"] = data["cleaned"].map(lambda doc: len(doc.replace(" ", "")))

# Number of whitespace-separated tokens in the cleaned tweet made up only of digits
data["num_dig"] = data["cleaned"].map(lambda doc: sum(1 for tok in doc.split() if tok.isdigit()))

In [None]:
# Print dataset
data.head()

Unnamed: 0,id,label,tweet,cleaned,word_count,word_count_cleand,char_count,char_count_without_spaces,num_dig
0,1,0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dys...,18,6,57,46,0
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks credit nt use cause nt offer wheel...,19,10,65,46,0
2,3,0,bihday your majesty,bihday majesty,3,2,17,13,0
3,4,0,#model i love u take with u all the time in ...,love u u time urð± ðððð ð...,14,12,62,44,0
4,5,0,factsguide: society now #motivation,factsguide society,4,2,24,17,0


### b. Counting Nouns and Verbs

In [None]:
# Import spacy English language model
import spacy
# NOTE: this rebinds the notebook-level `nlp` that `clean_text` reads at call
# time, so any `clean_text` call made after this cell runs through this loaded
# pipeline rather than the blank `English()` one created earlier.
nlp = spacy.load("en_core_web_sm")

In [None]:
# spacy document
document = nlp(data['cleaned'][0])

In [None]:
# Sample document
document

   father dysfunctional selfish drags kids dysfunction   

In [None]:
# Fine-grained POS tag of every token in the sample document
all_tags = [token.tag_ for token in document]

In [None]:
# POS tags
all_tags

['_SP', 'NNP', 'NNP', 'JJ', 'NNS', 'NNS', 'NN', '_SP']

In [None]:
# Dictionary of noun and verb POS tags (Penn Treebank fine-grained tags).
# "VBP" (non-3rd-person singular present, e.g. "are", "play") was missing from
# the verb family, so such verbs were silently left out of the verb counts.
pos_dic = {
    "noun": ["NNP", "NN", "NNS", "NNPS"],
    "verb": ["VBZ", "VB", "VBD", "VBG", "VBN", "VBP"],
}

In [None]:
# Sample document: number of tags that belong to the noun family
count = sum(1 for tag in all_tags if tag in pos_dic['noun'])

In [None]:
# Sample document: Noun count
count

5

In [None]:
# Function for noun and verb counts
def pos_check(txt, family):
    """Count the tokens of `txt` whose fine-grained POS tag is in `pos_dic[family]`.

    `txt` is run through the notebook-level `nlp` pipeline; `family` is a key
    of the notebook-level `pos_dic` (e.g. "noun" or "verb").
    """
    return sum(1 for token in nlp(txt) if token.tag_ in pos_dic[family])

In [None]:
# Sample document: Verb count
pos_check("They are playing in the ground", "verb")

1

In [None]:
# Sample document: Noun count
pos_check("They are playing in the ground", "noun")

1

In [None]:
# Noun and verb counts of every cleaned training tweet
data["noun_count"] = data["cleaned"].apply(pos_check, args=("noun",))
data["verb_count"] = data["cleaned"].apply(pos_check, args=("verb",))

In [None]:
# Print output
data.head()

Unnamed: 0,id,label,tweet,cleaned,word_count,word_count_cleand,char_count,char_count_without_spaces,num_dig,noun_count,verb_count
0,1,0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dys...,18,6,57,46,0,5,0
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks credit nt use cause nt offer wheel...,19,10,65,46,0,5,2
2,3,0,bihday your majesty,bihday majesty,3,2,17,13,0,2,0
3,4,0,#model i love u take with u all the time in ...,love u u time urð± ðððð ð...,14,12,62,44,0,8,0
4,5,0,factsguide: society now #motivation,factsguide society,4,2,24,17,0,2,0


### c. Model Building for Meta Features

In [None]:
# Encode the label column as integer class ids
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
target = label_encoder.fit_transform(data["label"].values)

In [None]:
# List of features
train = data[['word_count', 'word_count_cleand', 'char_count',
              'char_count_without_spaces', 'num_dig', 'noun_count',
              'verb_count']]

In [None]:
# Train-Validation split
from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(train, target, test_size = 0.2, random_state=20, stratify=target)

In [None]:
# Shapes of the training and validation datasets
(x_train.shape, y_train.shape), (x_valid.shape, y_valid.shape)

(((25569, 7), (25569,)), ((6393, 7), (6393,)))

In [None]:
# Naive bayes
from sklearn import naive_bayes

In [None]:
# Multinomial naive bayes
model = naive_bayes.MultinomialNB()

In [None]:
# Fit model on training data
model.fit(x_train, y_train)

MultinomialNB()

In [None]:
# Prediction on training data
pred_train = model.predict(x_train)
# Prediction on validation data
pred_valid = model.predict(x_valid)

In [None]:
# Accuracy
from sklearn.metrics import accuracy_score

In [None]:
# Training accuracy
accuracy_score(y_train, pred_train)

0.9298369118854863

In [None]:
# Validation accuracy
accuracy_score(y_valid, pred_valid)

0.9299233536680744

In [None]:
pred_valid

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Training F1 score
f1_score(y_train, pred_train)

0.0

In [None]:
f1_score(y_valid, pred_valid)

0.0

In [None]:
# Importing the required libraries
from sklearn import datasets
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# `plot_precision_recall_curve` was removed in scikit-learn 1.2. Import it when
# it exists and otherwise fall back to its replacement, so this cell does not
# abort a full notebook run on current releases.
try:
    from sklearn.metrics import plot_precision_recall_curve
except ImportError:
    from sklearn.metrics import PrecisionRecallDisplay

In [None]:

precision_score(y_valid,pred_valid)

  _warn_prf(average, modifier, msg_start, len(result))


0.0

### d. Tf-Idf Features

In [None]:
# Import Tf-Idf Vectoriser
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Tf-Idf Vectoriser
word_tfidf = TfidfVectorizer(max_features=500)

In [None]:
# Fit Tf-Idf Vectoriser
# NOTE(review): the vocabulary and IDF weights are fitted on every row of
# `data`, i.e. before the train/validation split performed later, so the
# validation rows influence the features. Consider fitting on the training
# rows only.
word_tfidf.fit(data["cleaned"].values)

TfidfVectorizer(max_features=500)

In [None]:
# Transform
word_vectors_tfidf = word_tfidf.transform(data["cleaned"].values)

In [None]:
# Tf-Idf vectors
word_vectors_tfidf

<31962x500 sparse matrix of type '<class 'numpy.float64'>'
	with 72898 stored elements in Compressed Sparse Row format>

In [None]:
# Build the full feature matrix: Tf-Idf columns followed by the meta features
from scipy.sparse import hstack, csr_matrix

# Names of the hand-crafted meta feature columns
meta_features = [
    'word_count',
    'word_count_cleand',
    'char_count',
    'char_count_without_spaces',
    'num_dig',
    'noun_count',
    'verb_count',
]

# Meta feature columns of the training frame
feature_set1 = data[meta_features]

# Horizontal stack of both feature groups, returned in CSR format
meta_sparse = csr_matrix(feature_set1)
train = hstack([word_vectors_tfidf, meta_sparse], format="csr")

### e. Model Building for Complete Feature Set

In [None]:
# Train and Validation datasets
# NOTE(review): no `test_size` is passed here, unlike the earlier meta-feature
# split (test_size = 0.2), so scikit-learn's default split size applies and the
# two experiments are evaluated on different validation sets.
x_train, x_valid, y_train, y_valid = train_test_split(train, target, random_state=20, stratify=target)

In [None]:
# Train and Validation datasets
(x_train.shape, y_train.shape), (x_valid.shape, y_valid.shape)

(((23971, 507), (23971,)), ((7991, 507), (7991,)))

In [None]:
# Multinomial Naive Bayes Model
# NOTE: rebinds `model`; the meta-features-only classifier fitted earlier is no
# longer reachable under this name after this cell runs.
model = naive_bayes.MultinomialNB()

In [None]:
# Fit model
model.fit(x_train, y_train)

MultinomialNB()

In [None]:
# Prediction on Training data
pred_train = model.predict(x_train)
# Prediction on Validation data
pred_valid = model.predict(x_valid)

In [None]:
# Training accuracy
accuracy_score(y_train, pred_train)

0.9396353927662592

In [None]:
# Validation accuracy
accuracy_score(y_valid, pred_valid)

0.9398072831935927

In [None]:
f1_score(y_valid, pred_valid)

0.2895125553914328

In [None]:
f1_score(y_train, pred_train)

0.299273607748184

In [None]:
test = pd.read_csv('test_tweets_anuFYb8.csv')

In [None]:
test.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [None]:
# Apply the same preprocessing function to every test tweet
test["cleaned"] = test["tweet"].apply(clean_text)

In [None]:
test.head()

Unnamed: 0,id,tweet,cleaned
0,31963,#studiolife #aislife #requires #passion #dedic...,find  ¦
1,31964,@user #white #supremacists want everyone to s...,want new â  â hereâs
2,31965,safe ways to heal your #acne!! #altwaystohe...,safe ways heal
3,31966,is the hp and the cursed child book up for res...,hp cursed child book reservations yes ðð...
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",3rd amazing hilarious eli ahmir uncle d...


In [None]:
# Meta features for the test set, derived from the raw and the cleaned text

# Whitespace-separated word count of the original tweet
test["word_count"] = test["tweet"].map(lambda tweet: len(tweet.split()))
# Whitespace-separated word count of the cleaned tweet
test["word_count_cleand"] = test["cleaned"].map(lambda doc: len(doc.split()))

# Length of the cleaned tweet, spaces included
test["char_count"] = test["cleaned"].map(len)
# Length of the cleaned tweet once every space character is removed
test["char_count_without_spaces"] = test["cleaned"].map(lambda doc: len(doc.replace(" ", "")))

# Number of whitespace-separated tokens in the cleaned tweet made up only of digits
test["num_dig"] = test["cleaned"].map(lambda doc: sum(1 for tok in doc.split() if tok.isdigit()))

In [None]:
test.head()

Unnamed: 0,id,tweet,cleaned,word_count,word_count_cleand,char_count,char_count_without_spaces,num_dig
0,31963,#studiolife #aislife #requires #passion #dedic...,find  ¦,9,3,17,6,0
1,31964,@user #white #supremacists want everyone to s...,want new â  â hereâs,16,6,39,23,0
2,31965,safe ways to heal your #acne!! #altwaystohe...,safe ways heal,9,3,24,12,0
3,31966,is the hp and the cursed child book up for res...,hp cursed child book reservations yes ðð...,22,7,55,44,0
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",3rd amazing hilarious eli ahmir uncle d...,15,10,66,50,0


In [None]:
# Noun and verb counts of every cleaned test tweet
test["noun_count"] = test["cleaned"].apply(pos_check, args=("noun",))
test["verb_count"] = test["cleaned"].apply(pos_check, args=("verb",))

In [None]:
test.head()

Unnamed: 0,id,tweet,cleaned,word_count,word_count_cleand,char_count,char_count_without_spaces,num_dig,noun_count,verb_count
0,31963,#studiolife #aislife #requires #passion #dedic...,find  ¦,9,3,17,6,0,1,0
1,31964,@user #white #supremacists want everyone to s...,want new â  â hereâs,16,6,39,23,0,3,0
2,31965,safe ways to heal your #acne!! #altwaystohe...,safe ways heal,9,3,24,12,0,1,0
3,31966,is the hp and the cursed child book up for res...,hp cursed child book reservations yes ðð...,22,7,55,44,0,4,1
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",3rd amazing hilarious eli ahmir uncle d...,15,10,66,50,0,7,1


In [None]:
# Meta feature columns of the test set, in the same order as for training
test_data = test.loc[:, [
    'word_count',
    'word_count_cleand',
    'char_count',
    'char_count_without_spaces',
    'num_dig',
    'noun_count',
    'verb_count',
]]

In [None]:
test_data

Unnamed: 0,word_count,word_count_cleand,char_count,char_count_without_spaces,num_dig,noun_count,verb_count
0,9,3,17,6,0,1,0
1,16,6,39,23,0,3,0
2,9,3,24,12,0,1,0
3,22,7,55,44,0,4,1
4,15,10,66,50,0,7,1
...,...,...,...,...,...,...,...
17192,11,5,50,38,0,4,1
17193,13,6,39,25,0,2,3
17194,17,7,61,46,0,6,0
17195,12,8,81,69,0,5,1


In [None]:
# `model` was last fitted on the Tf-Idf columns stacked with the meta features,
# so the test matrix has to be assembled the same way (Tf-Idf first, meta
# features last). Passing only the 7 meta columns does not match the fitted
# feature count.
test_features = hstack(
    [word_tfidf.transform(test["cleaned"].values), csr_matrix(test_data)],
    "csr",
)
test3 = model.predict(test_features)

In [None]:
# Predict with the same feature layout the current `model` was fitted on:
# Tf-Idf vectors of the cleaned test tweets followed by the meta features.
# The 7 meta columns alone do not match the fitted feature count.
test_data_list = model.predict(
    hstack(
        [word_tfidf.transform(test["cleaned"].values), csr_matrix(test_data)],
        "csr",
    )
)

In [None]:
test_data_list

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [None]:
# NOTE: plain assignment, so `final_testing` is a second name for the same
# DataFrame object as `test`, not a copy.
final_testing = test

In [None]:
# Preview only: the result of `drop` is displayed but not assigned, so
# `final_testing` keeps all of its columns.
final_testing.drop(['cleaned','word_count'],axis=1)

Unnamed: 0,id,tweet,word_count_cleand,char_count,char_count_without_spaces,num_dig,noun_count,verb_count
0,31963,#studiolife #aislife #requires #passion #dedic...,3,17,6,0,1,0
1,31964,@user #white #supremacists want everyone to s...,6,39,23,0,3,0
2,31965,safe ways to heal your #acne!! #altwaystohe...,3,24,12,0,1,0
3,31966,is the hp and the cursed child book up for res...,7,55,44,0,4,1
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",10,66,50,0,7,1
...,...,...,...,...,...,...,...,...
17192,49155,thought factory: left-right polarisation! #tru...,5,50,38,0,4,1
17193,49156,feeling like a mermaid ð #hairflip #neverre...,6,39,25,0,2,3
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...,7,61,46,0,6,0
17195,49158,"happy, at work conference: right mindset leads...",8,81,69,0,5,1


In [None]:
# Preview only: the result of `drop` is displayed but not assigned, so
# `final_testing` keeps all of its columns.
final_testing.drop(['word_count_cleand','char_count',],axis=1)

Unnamed: 0,id,tweet,cleaned,word_count,char_count_without_spaces,num_dig,noun_count,verb_count
0,31963,#studiolife #aislife #requires #passion #dedic...,find  ¦,9,6,0,1,0
1,31964,@user #white #supremacists want everyone to s...,want new â  â hereâs,16,23,0,3,0
2,31965,safe ways to heal your #acne!! #altwaystohe...,safe ways heal,9,12,0,1,0
3,31966,is the hp and the cursed child book up for res...,hp cursed child book reservations yes ðð...,22,44,0,4,1
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",3rd amazing hilarious eli ahmir uncle d...,15,50,0,7,1
...,...,...,...,...,...,...,...,...
17192,49155,thought factory: left-right polarisation! #tru...,thought factory leftright polarisation ...,11,38,0,4,1
17193,49156,feeling like a mermaid ð #hairflip #neverre...,feeling like mermaid ð â ¦,13,25,0,2,3
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...,today omg amp words like assetsampliability...,17,46,0,6,0
17195,49158,"happy, at work conference: right mindset leads...",happy work conference right mindset leads cult...,12,69,0,5,1


In [None]:
# Start the submission frame by dropping the text and most meta feature columns
final = final_testing.drop(
    columns=[
        'cleaned',
        'word_count',
        'char_count_without_spaces',
        'num_dig',
        'noun_count',
        'verb_count',
    ]
)

In [None]:
# Preview only (repeats an earlier cell): the result of `drop` is displayed but
# not assigned, so `final_testing` keeps all of its columns.
final_testing.drop(['word_count_cleand','char_count',],axis=1)

Unnamed: 0,id,tweet,cleaned,word_count,char_count_without_spaces,num_dig,noun_count,verb_count
0,31963,#studiolife #aislife #requires #passion #dedic...,find  ¦,9,6,0,1,0
1,31964,@user #white #supremacists want everyone to s...,want new â  â hereâs,16,23,0,3,0
2,31965,safe ways to heal your #acne!! #altwaystohe...,safe ways heal,9,12,0,1,0
3,31966,is the hp and the cursed child book up for res...,hp cursed child book reservations yes ðð...,22,44,0,4,1
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",3rd amazing hilarious eli ahmir uncle d...,15,50,0,7,1
...,...,...,...,...,...,...,...,...
17192,49155,thought factory: left-right polarisation! #tru...,thought factory leftright polarisation ...,11,38,0,4,1
17193,49156,feeling like a mermaid ð #hairflip #neverre...,feeling like mermaid ð â ¦,13,25,0,2,3
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...,today omg amp words like assetsampliability...,17,46,0,6,0
17195,49158,"happy, at work conference: right mindset leads...",happy work conference right mindset leads cult...,12,69,0,5,1


In [None]:
# Remove the two remaining meta feature columns from the submission frame
final = final.drop(columns=['word_count_cleand', 'char_count'])

In [None]:
final

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."
...,...,...
17192,49155,thought factory: left-right polarisation! #tru...
17193,49156,feeling like a mermaid ð #hairflip #neverre...
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...
17195,49158,"happy, at work conference: right mindset leads..."


In [None]:
# Remove the raw tweet text from the submission frame
final = final.drop(columns=['tweet'])

In [None]:
final

Unnamed: 0,id
0,31963
1,31964
2,31965
3,31966
4,31967
...,...
17192,49155
17193,49156
17194,49157
17195,49158


In [None]:
# Attach the model predictions as the `pred` column
final = final.assign(pred=test_data_list)

In [None]:
final['pred'].describe()

count    17197.0
mean         0.0
std          0.0
min          0.0
25%          0.0
50%          0.0
75%          0.0
max          0.0
Name: pred, dtype: float64