In [1]:
import pandas as pd
import string
import nltk

In [2]:
df=pd.read_csv('twitter_training.csv')

In [3]:
df.head()

Unnamed: 0,num,Subject,Kind,tweet
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [4]:
df['Kind'] = df['Kind'].astype('category')
df['Kind'] = df['Kind'].cat.codes
df['Kind']

0        3
1        3
2        3
3        3
4        3
        ..
74676    3
74677    3
74678    3
74679    3
74680    3
Name: Kind, Length: 74681, dtype: int8

In [5]:
y=df['Kind']

In [6]:
df.drop(['Kind','num' ], axis=1, inplace=True)

In [7]:
df.head()

Unnamed: 0,Subject,tweet
0,Borderlands,I am coming to the borders and I will kill you...
1,Borderlands,im getting on borderlands and i will kill you ...
2,Borderlands,im coming on borderlands and i will murder you...
3,Borderlands,im getting on borderlands 2 and i will murder ...
4,Borderlands,im getting into borderlands and i can murder y...


In [8]:
## Convert to lowercase
df['clean_tweet'] = df['tweet'].str.lower()
df.head(2)

Unnamed: 0,Subject,tweet,clean_tweet
0,Borderlands,I am coming to the borders and I will kill you...,i am coming to the borders and i will kill you...
1,Borderlands,im getting on borderlands and i will kill you ...,im getting on borderlands and i will kill you ...


In [9]:
#  Punctuations
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
def rm_punc(z):
    if not isinstance(z, str):  
        z = ""  
    punctuations = string.punctuation
    return z.translate(str.maketrans('', '', punctuations))

In [11]:
df['clean_tweet'] = df['clean_tweet'].apply(lambda y: rm_punc(y))
df.head()

Unnamed: 0,Subject,tweet,clean_tweet
0,Borderlands,I am coming to the borders and I will kill you...,i am coming to the borders and i will kill you...
1,Borderlands,im getting on borderlands and i will kill you ...,im getting on borderlands and i will kill you all
2,Borderlands,im coming on borderlands and i will murder you...,im coming on borderlands and i will murder you...
3,Borderlands,im getting on borderlands 2 and i will murder ...,im getting on borderlands 2 and i will murder ...
4,Borderlands,im getting into borderlands and i can murder y...,im getting into borderlands and i can murder y...


In [13]:
# Stopwords
from nltk.corpus import stopwords

In [14]:
STOPWORDS = set(stopwords.words('english'))
def rm_sw(x):
    return " ".join([word for word in x.split() if word not in STOPWORDS])

In [15]:
df['clean_tweet'] = df['clean_tweet'].apply(lambda y: rm_sw(y))
df.head()

Unnamed: 0,Subject,tweet,clean_tweet
0,Borderlands,I am coming to the borders and I will kill you...,coming borders kill
1,Borderlands,im getting on borderlands and i will kill you ...,im getting borderlands kill
2,Borderlands,im coming on borderlands and i will murder you...,im coming borderlands murder
3,Borderlands,im getting on borderlands 2 and i will murder ...,im getting borderlands 2 murder
4,Borderlands,im getting into borderlands and i can murder y...,im getting borderlands murder


In [16]:
# remove_url
import re
def rm_url (x):
    text = re.sub(r'https?://\s+|www\.\s+',  '', x)
    
    return x

In [17]:
df['clean_tweet'] = df['clean_tweet'].apply(lambda y: rm_url(y))
df.head()

Unnamed: 0,Subject,tweet,clean_tweet
0,Borderlands,I am coming to the borders and I will kill you...,coming borders kill
1,Borderlands,im getting on borderlands and i will kill you ...,im getting borderlands kill
2,Borderlands,im coming on borderlands and i will murder you...,im coming borderlands murder
3,Borderlands,im getting on borderlands 2 and i will murder ...,im getting borderlands 2 murder
4,Borderlands,im getting into borderlands and i can murder y...,im getting borderlands murder


In [18]:
# Remove Special characters

def rm_spl_char(x):
    x = re.sub('[^a-zA-Z0-9]', ' ', x)
    x = re.sub('\s+', ' ', x)
    return x

In [19]:
df['clean_tweet'] = df['clean_tweet'].apply(lambda y: rm_spl_char(y))
df.head()

Unnamed: 0,Subject,tweet,clean_tweet
0,Borderlands,I am coming to the borders and I will kill you...,coming borders kill
1,Borderlands,im getting on borderlands and i will kill you ...,im getting borderlands kill
2,Borderlands,im coming on borderlands and i will murder you...,im coming borderlands murder
3,Borderlands,im getting on borderlands 2 and i will murder ...,im getting borderlands 2 murder
4,Borderlands,im getting into borderlands and i can murder y...,im getting borderlands murder


In [20]:
# Stemming

from nltk.stem.porter import PorterStemmer
PorterStemmer = PorterStemmer()
def stemming(x):
    return " ".join([PorterStemmer.stem(word) for word in x.split()])

In [21]:
df['stemmed_tweet'] = df['clean_tweet'].apply(lambda y: stemming(y))
df.head()

Unnamed: 0,Subject,tweet,clean_tweet,stemmed_tweet
0,Borderlands,I am coming to the borders and I will kill you...,coming borders kill,come border kill
1,Borderlands,im getting on borderlands and i will kill you ...,im getting borderlands kill,im get borderland kill
2,Borderlands,im coming on borderlands and i will murder you...,im coming borderlands murder,im come borderland murder
3,Borderlands,im getting on borderlands 2 and i will murder ...,im getting borderlands 2 murder,im get borderland 2 murder
4,Borderlands,im getting into borderlands and i can murder y...,im getting borderlands murder,im get borderland murder


In [22]:
# Lemmatizing
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

WordNetLemmatizer =WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

def lemmatize(x):
    # find pos tags
    pos_text = pos_tag(x.split())
    return " ".join([WordNetLemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text])

In [23]:
df['lemmatized_tweet'] = df['clean_tweet'].apply(lambda y: lemmatize(y))
df.head()

Unnamed: 0,Subject,tweet,clean_tweet,stemmed_tweet,lemmatized_tweet
0,Borderlands,I am coming to the borders and I will kill you...,coming borders kill,come border kill,come border kill
1,Borderlands,im getting on borderlands and i will kill you ...,im getting borderlands kill,im get borderland kill,im get borderland kill
2,Borderlands,im coming on borderlands and i will murder you...,im coming borderlands murder,im come borderland murder,im come borderland murder
3,Borderlands,im getting on borderlands 2 and i will murder ...,im getting borderlands 2 murder,im get borderland 2 murder,im get borderland 2 murder
4,Borderlands,im getting into borderlands and i can murder y...,im getting borderlands murder,im get borderland murder,im get borderland murder


In [24]:
total_word_count = 0
for content in df['lemmatized_tweet']:
    total_word_count += sum(1 for _ in content.split())
print(total_word_count)

832931


In [25]:
# Tagging Parts of Speech

from nltk.tokenize import word_tokenize

def tokenize_text(x):
    return word_tokenize(x)

tok = df['lemmatized_tweet'].apply(lambda y: tokenize_text(y))


In [26]:
def apply_pos_tagging(tokens):
    return pos_tag(tokens)

tps= tok.apply(lambda y: apply_pos_tagging(y))

print(tps)

0                  [(come, VBN), (border, NN), (kill, NN)]
1        [(im, NN), (get, NN), (borderland, NN), (kill,...
2        [(im, JJ), (come, VBN), (borderland, NN), (mur...
3        [(im, NN), (get, NN), (borderland, VBP), (2, C...
4        [(im, NN), (get, NN), (borderland, NN), (murde...
                               ...                        
74676    [(realize, VB), (window, JJ), (partition, NN),...
74677    [(realize, VB), (mac, NN), (window, JJ), (part...
74678    [(realize, VB), (window, JJ), (partition, NN),...
74679    [(realize, VB), (window, JJ), (partition, NN),...
74680    [(like, IN), (window, JJS), (partition, NN), (...
Name: lemmatized_tweet, Length: 74681, dtype: object


In [27]:
# Named-Entity Recognition

import spacy
nlp = spacy.load("en_core_web_sm")
nlp.pipe_names


['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [28]:
def entities(x):
    doc = nlp(x)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

ndr = df['lemmatized_tweet'].apply(entities)

print(ndr)

0                      []
1                      []
2                      []
3         [(2, CARDINAL)]
4                      []
               ...       
74676    [(6 year, DATE)]
74677    [(6 year, DATE)]
74678    [(6 year, DATE)]
74679    [(6 year, DATE)]
74680    [(6 year, DATE)]
Name: lemmatized_tweet, Length: 74681, dtype: object


In [29]:
# Dependency Parsing
def dependencies(x):
    doc = nlp(x)
    dependencies = [(token.text, token.dep_, token.head.text) for token in doc]
    return dependencies

par = df['lemmatized_tweet'].apply(lambda y: dependencies(y))

print(par)

0        [(come, ROOT, come), (border, compound, kill),...
1        [(i, nsubj, get), (m, aux, get), (get, ROOT, g...
2        [(i, nsubjpass, come), (m, auxpass, come), (co...
3        [(i, nsubj, get), (m, aux, get), (get, ROOT, g...
4        [(i, nsubj, get), (m, aux, get), (get, ROOT, g...
                               ...                        
74676    [(realize, ROOT, realize), (window, compound, ...
74677    [(realize, ROOT, realize), (mac, compound, par...
74678    [(realize, ROOT, realize), (window, compound, ...
74679    [(realize, ROOT, realize), (window, compound, ...
74680    [(like, prep, notice), (window, compound, part...
Name: lemmatized_tweet, Length: 74681, dtype: object


In [30]:
# Rule-Based Matching 
from spacy.matcher import Matcher
match = Matcher(nlp.vocab)
pattern = [{'LOWER': 'covid19'}]
match.add("covid19Pattern", [pattern])

In [31]:
def matching(x):
    doc = nlp(x)
    matches = match(doc)
    matched_tokens = [doc[start:end].text for match_id, start, end in matches]
    return matched_tokens

m = df['lemmatized_tweet'].apply(lambda y: matching(y))

print(m)

0        []
1        []
2        []
3        []
4        []
         ..
74676    []
74677    []
74678    []
74679    []
74680    []
Name: lemmatized_tweet, Length: 74681, dtype: object


In [32]:
X=df['lemmatized_tweet']

In [33]:
# Vectorization tf_idf
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [34]:
X = tfidf.fit_transform(X)


In [35]:
# split
from sklearn.model_selection import train_test_split
X_train , X_test , Y_train , Y_test  = train_test_split(X, y, test_size = 0.2 ,  random_state = 42)

In [36]:
# Model
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

In [37]:
model = SVC()
model.fit(X_train, Y_train)

Y_pred = model.predict(X_test)

accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9005824462743522
