In [26]:
import pandas as pd
import string
import nltk

In [27]:
# Load the tweet dataset from the notebook's working directory.
# NOTE(review): the frames displayed further down carry an 'Unnamed: 0' column,
# which usually means the CSV was written together with its index — consider
# reading with index_col=0; confirm against the file.
df=pd.read_csv('twitter_training.csv')

In [112]:
# Drop every row that has a missing value in any column.
# Reassigning instead of using inplace=True keeps the step explicit, and the
# preview goes last so the notebook actually displays it (a bare expression
# that is not the final line of a cell is evaluated and thrown away).
df = df.dropna()
df.head()

In [113]:
## Convert to lowercase
# Seed the 'clean_review' column with a lowercased copy of the raw 'review'
# text; the following cells overwrite 'clean_review' one cleaning step at a time.
df['clean_review'] = df['review'].str.lower()
df.head(2)

Unnamed: 0,review,clean_review,stemmed_review,lemmatized_review
0,I am ? This coming to the borders and I will k...,i am ? this coming to the borders and i will k...,come border kill,come border kill
1,Im getting!! on borderlands and i will kill yo...,im getting!! on borderlands and i will kill yo...,im get borderland kill,im get borderland kill


In [114]:
# Punctuation: display the ASCII punctuation characters that rm_punc removes.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [115]:
def rm_punc(z):
    """Return ``z`` with every ASCII punctuation character removed.

    Any value that is not a ``str`` (for example NaN or None) is treated as
    an empty string, so the result is always a string.
    """
    text = z if isinstance(z, str) else ""
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)

In [116]:
# Strip punctuation from every cleaned review, then preview the frame.
df['clean_review'] = df['clean_review'].apply(rm_punc)
df.head()

Unnamed: 0,review,clean_review,stemmed_review,lemmatized_review
0,I am ? This coming to the borders and I will k...,i am this coming to the borders and i will ki...,come border kill,come border kill
1,Im getting!! on borderlands and i will kill yo...,im getting on borderlands and i will kill you all,im get borderland kill,im get borderland kill
2,im coming. on borderlands and i will murder yo...,im coming on borderlands and i will murder you...,im come borderland murder,im come borderland murder
3,im getting informations on borderlands 2 and i...,im getting informations on borderlands 2 and i...,im get inform borderland 2 murder,im get information borderlands 2 murder
4,im getting into borderlands and i can murder y...,im getting into borderlands and i can murder y...,im get borderland murder,im get borderland murder


In [117]:
# Stopwords
from nltk.corpus import stopwords

In [118]:
STOPWORDS = set(stopwords.words('english'))
def rm_sw(x):
    """Return ``x`` with every whitespace-separated word found in STOPWORDS dropped.

    The surviving words keep their order and are re-joined with single spaces.
    """
    kept = (token for token in x.split() if token not in STOPWORDS)
    return " ".join(kept)

In [119]:
# Remove English stopwords from every cleaned review, then preview the frame.
df['clean_review'] = df['clean_review'].apply(rm_sw)
df.head()

Unnamed: 0,review,clean_review,stemmed_review,lemmatized_review
0,I am ? This coming to the borders and I will k...,coming borders kill,come border kill,come border kill
1,Im getting!! on borderlands and i will kill yo...,im getting borderlands kill,im get borderland kill,im get borderland kill
2,im coming. on borderlands and i will murder yo...,im coming borderlands murder,im come borderland murder,im come borderland murder
3,im getting informations on borderlands 2 and i...,im getting informations borderlands 2 murder,im get inform borderland 2 murder,im get information borderlands 2 murder
4,im getting into borderlands and i can murder y...,im getting borderlands murder,im get borderland murder,im get borderland murder


In [120]:
# remove_url
import re
def rm_url(x):
    """Return ``x`` with http(s):// and www. URLs removed.

    A URL is taken to be the scheme (or ``www.``) plus the run of
    non-whitespace characters that follows it. Whitespace around a removed
    URL is left untouched, so a doubled space can remain where it stood.
    """
    # \S+ (non-whitespace) is what follows a scheme; the previous \s+ could only
    # match a scheme followed by blanks, and the substituted text was discarded.
    return re.sub(r'https?://\S+|www\.\S+', '', x)

In [121]:
# Remove URLs from every cleaned review, then preview the frame.
# NOTE(review): punctuation was already stripped from 'clean_review' in an
# earlier cell, so "://" and "." are gone by the time this runs — URL removal
# probably has to happen before rm_punc to have any effect; confirm.
df['clean_review'] = df['clean_review'].apply(rm_url)
df.head()

Unnamed: 0,review,clean_review,stemmed_review,lemmatized_review
0,I am ? This coming to the borders and I will k...,coming borders kill,come border kill,come border kill
1,Im getting!! on borderlands and i will kill yo...,im getting borderlands kill,im get borderland kill,im get borderland kill
2,im coming. on borderlands and i will murder yo...,im coming borderlands murder,im come borderland murder,im come borderland murder
3,im getting informations on borderlands 2 and i...,im getting informations borderlands 2 murder,im get inform borderland 2 murder,im get information borderlands 2 murder
4,im getting into borderlands and i can murder y...,im getting borderlands murder,im get borderland murder,im get borderland murder


In [122]:
# Remove Special characters

def rm_spl_char(x):
    """Replace each run of characters outside ``a-z``, ``A-Z``, ``0-9`` with one space.

    Whitespace counts as non-alphanumeric, so runs of blanks are collapsed as
    well. Leading or trailing runs become a single leading or trailing space.
    """
    # One raw-string pattern does both the replacement and the collapsing; the
    # earlier non-raw '\s+' literal is an invalid escape sequence that newer
    # Python versions warn about.
    return re.sub(r'[^a-zA-Z0-9]+', ' ', x)

In [123]:
# Replace special characters in every cleaned review, then preview the frame.
df['clean_review'] = df['clean_review'].apply(rm_spl_char)
df.head()

Unnamed: 0,review,clean_review,stemmed_review,lemmatized_review
0,I am ? This coming to the borders and I will k...,coming borders kill,come border kill,come border kill
1,Im getting!! on borderlands and i will kill yo...,im getting borderlands kill,im get borderland kill,im get borderland kill
2,im coming. on borderlands and i will murder yo...,im coming borderlands murder,im come borderland murder,im come borderland murder
3,im getting informations on borderlands 2 and i...,im getting informations borderlands 2 murder,im get inform borderland 2 murder,im get information borderlands 2 murder
4,im getting into borderlands and i can murder y...,im getting borderlands murder,im get borderland murder,im get borderland murder


In [124]:
# Stemming

from nltk.stem.porter import PorterStemmer

# Keep the stemmer instance under its own name. Rebinding `PorterStemmer` to an
# instance hides the class, so any later `PorterStemmer()` call that is not
# preceded by a fresh import would fail.
porter_stemmer = PorterStemmer()

def stemming(x):
    """Return ``x`` with each whitespace-separated word replaced by its Porter stem.

    Words are re-joined with single spaces.
    """
    return " ".join(porter_stemmer.stem(word) for word in x.split())

In [125]:
# Stem every cleaned review into its own column, then preview the frame.
df['stemmed_review'] = df['clean_review'].apply(stemming)
df.head()

Unnamed: 0,review,clean_review,stemmed_review,lemmatized_review
0,I am ? This coming to the borders and I will k...,coming borders kill,come border kill,come border kill
1,Im getting!! on borderlands and i will kill yo...,im getting borderlands kill,im get borderland kill,im get borderland kill
2,im coming. on borderlands and i will murder yo...,im coming borderlands murder,im come borderland murder,im come borderland murder
3,im getting informations on borderlands 2 and i...,im getting informations borderlands 2 murder,im get inform borderland 2 murder,im get information borderlands 2 murder
4,im getting into borderlands and i can murder y...,im getting borderlands murder,im get borderland murder,im get borderland murder


In [126]:
# Lemmatizing
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Keep the lemmatizer instance under its own name. Rebinding `WordNetLemmatizer`
# to an instance hides the class, so any later `WordNetLemmatizer()` call that
# is not preceded by a fresh import would fail.
wordnet_lemmatizer = WordNetLemmatizer()
# First letter of a POS tag (e.g. 'VBN' -> 'V') -> WordNet part-of-speech constant.
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

def lemmatize(x):
    """Return ``x`` with each whitespace-separated word replaced by its WordNet lemma.

    Every word is POS-tagged first so the lemmatizer receives the matching
    WordNet part of speech; tags whose first letter is not in ``wordnet_map``
    fall back to noun. Lemmas are re-joined with single spaces.
    """
    tagged = pos_tag(x.split())
    return " ".join(
        wordnet_lemmatizer.lemmatize(word, wordnet_map.get(tag[0], wordnet.NOUN))
        for word, tag in tagged
    )

In [94]:
# Lemmatize every cleaned review into its own column, then preview the frame.
df['lemmatized_review'] = df['clean_review'].apply(lemmatize)
df.head()

Unnamed: 0,review,clean_review,stemmed_review,lemmatized_review
0,I am ? This coming to the borders and I will k...,coming borders kill,come border kill,come border kill
1,Im getting!! on borderlands and i will kill yo...,im getting borderlands kill,im get borderland kill,im get borderland kill
2,im coming. on borderlands and i will murder yo...,im coming borderlands murder,im come borderland murder,im come borderland murder
3,im getting informations on borderlands 2 and i...,im getting informations borderlands 2 murder,im get inform borderland 2 murder,im get information borderlands 2 murder
4,im getting into borderlands and i can murder y...,im getting borderlands murder,im get borderland murder,im get borderland murder


In [95]:
# Tagging Parts of Speech

from nltk.tokenize import word_tokenize

def tokenize_text(x):
    """Split ``x`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(x)
    return tokens

# Token lists for every lemmatized review.
tok = df['lemmatized_review'].apply(tokenize_text)


In [96]:
def apply_pos_tagging(tokens):
    """Return ``pos_tag``'s (token, tag) pairs for the given token list."""
    tagged = pos_tag(tokens)
    return tagged

# POS-tag each review's token list.
tps = tok.apply(apply_pos_tagging)

print(tps)

0                 [(come, VBN), (border, NN), (kill, NN)]
1       [(im, NN), (get, NN), (borderland, NN), (kill,...
2       [(im, JJ), (come, VBN), (borderland, NN), (mur...
3       [(im, JJ), (get, VB), (information, NN), (bord...
4       [(im, NN), (get, NN), (borderland, NN), (murde...
                              ...                        
1996    [(borderland, NN), (need, MD), (help, VB), (ga...
1997    [(congratulation, NN), (borderlands, VBZ), (re...
1998    [(congratulation, NN), (borderlands, VBZ), (re...
1999    [(congratulation, NN), (borderlands, VBZ), (re...
2000    [(congratulation, NN), (borderlands, VBZ), (re...
Name: lemmatized_review, Length: 1995, dtype: object


In [97]:
# Named-Entity Recognition

import spacy
# Load spaCy's small English pipeline by name; `nlp` is reused by the entity,
# dependency and matcher cells below.
nlp = spacy.load("en_core_web_sm")
# Display the names of the pipeline's components.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [98]:
def entities(x):
    """Return the named entities spaCy finds in ``x`` as ``(text, label)`` pairs."""
    found = []
    for span in nlp(x).ents:
        found.append((span.text, span.label_))
    return found

# Named entities for every lemmatized review.
ndr = df['lemmatized_review'].apply(entities)

print(ndr)

0                                                      []
1                                                      []
2                                                      []
3                                         [(2, CARDINAL)]
4                                                      []
                              ...                        
1996                                                   []
1997    [(congratulation borderlands research institut...
1998    [(congratulation borderlands research institut...
1999    [(congratulation borderlands research institut...
2000    [(congratulation borderlands research institut...
Name: lemmatized_review, Length: 1995, dtype: object


In [99]:
# Dependency Parsing
def dependencies(x):
    """Return ``(token text, dependency label, head text)`` for each token spaCy parses in ``x``."""
    triples = []
    for word in nlp(x):
        triples.append((word.text, word.dep_, word.head.text))
    return triples

# Dependency triples for every lemmatized review.
par = df['lemmatized_review'].apply(dependencies)

print(par)

0       [(come, ROOT, come), (border, compound, kill),...
1       [(i, nsubj, get), (m, aux, get), (get, ROOT, g...
2       [(i, nsubjpass, come), (m, auxpass, come), (co...
3       [(i, nsubj, get), (m, aux, get), (get, ROOT, g...
4       [(i, nsubj, get), (m, aux, get), (get, ROOT, g...
                              ...                        
1996    [(borderland, intj, need), (need, ROOT, need),...
1997    [(congratulation, compound, borderlands), (bor...
1998    [(congratulation, compound, borderlands), (bor...
1999    [(congratulation, compound, borderlands), (bor...
2000    [(congratulation, compound, borderlands), (bor...
Name: lemmatized_review, Length: 1995, dtype: object


In [135]:
# Rule-Based Matching 
from spacy.matcher import Matcher
# Rule-based matcher built on the loaded pipeline's vocabulary.
match = Matcher(nlp.vocab)
# One-token pattern: a token whose lowercased text is exactly 'kill'.
pattern = [{'LOWER': 'kill'}]
# Register the pattern under the key "killPattern".
match.add("killPattern", [pattern])

In [136]:
def matching(x):
    """Return the text of every span in ``x`` hit by the module-level ``match`` matcher."""
    parsed = nlp(x)
    hits = []
    for _match_id, start, end in match(parsed):
        hits.append(parsed[start:end].text)
    return hits

# Matched spans for every lemmatized review.
m = df['lemmatized_review'].apply(matching)

print(m)

0       [kill]
1       [kill]
2           []
3           []
4           []
         ...  
1996        []
1997        []
1998        []
1999        []
2000        []
Name: lemmatized_review, Length: 1995, dtype: object


In [137]:
# Model input text: the lemmatized reviews.
X=df['lemmatized_review']

In [138]:
# Vectorization tf_idf
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer constructed with no arguments, i.e. library defaults.
tfidf = TfidfVectorizer()

In [139]:
# Vectorize straight from the source column rather than from `X` itself, so
# re-running this cell never feeds the already-vectorized matrix back in.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so held-out vocabulary/IDF statistics leak into the training features.
X = tfidf.fit_transform(df['lemmatized_review'])


In [140]:
# split
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
# NOTE(review): `y` is not assigned in any cell shown in this notebook, so this
# line raises NameError on a fresh kernel; the displayed frames also show no
# label column. Define the target explicitly before splitting — confirm which
# column holds the labels.
X_train , X_test , Y_train , Y_test  = train_test_split(X, y, test_size = 0.2 ,  random_state = 42)

In [141]:
# Model
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

In [142]:
# Train a default-configured support vector classifier on the training split.
model = SVC()
model.fit(X_train, Y_train)

# Predict labels for the held-out split.
Y_pred = model.predict(X_test)

# Fraction of held-out rows predicted exactly right.
# NOTE(review): the recorded run printed ~0.043, which is suspiciously low —
# check what `y` actually contained before trusting this number.
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.042606516290726815
