In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import seaborn as sns

In [2]:
# Seed NumPy's global random number generator so that runs are repeatable.
np.random.seed(seed=500)

In [3]:
# Read the tab-separated OLID training file into a DataFrame.
Corpus = pd.read_csv(
    "../input/olidtrain/olid-training-v1.0.tsv",
    delimiter="\t",
)



In [4]:
# Display only the rows whose subtask_a label is "OFF".
Corpus.loc[Corpus["subtask_a"].eq("OFF")]

In [5]:

# Text pre-processing: turn every tweet into a list of lemmas and store its
# string form in Corpus['text_final'] for the TF-IDF vectoriser.
# NOTE: this cell overwrites Corpus['tweet'] with token lists, so re-running it
# requires reloading Corpus first.

# Step a: lower-case every tweet so that 'dog' and 'DOG' become the same token.
Corpus['tweet'] = [entry.lower() for entry in Corpus['tweet']]
# Step b: tokenisation - split every tweet into a list of word tokens.
Corpus['tweet'] = [word_tokenize(entry) for entry in Corpus['tweet']]
# Step c: drop stop words and non-alphabetic tokens, then lemmatise the rest.
# WordNetLemmatizer needs a part of speech; tags that do not start with
# J (adjective), V (verb) or R (adverb) fall back to noun.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# Loop-invariant helpers are built once: a set gives constant-time membership
# tests, and the stop-word list is no longer re-read for every single token.
stop_words = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()

processed_tweets = []
for entry in Corpus['tweet']:
    # pos_tag yields (word, tag) pairs; the first letter of the tag selects
    # the WordNet part of speech through tag_map.
    Final_words = [
        word_Lemmatized.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        if word not in stop_words and word.isalpha()
    ]
    # Keep the same representation as before: the str() of the lemma list.
    processed_tweets.append(str(Final_words))

# Assign the whole column in one go instead of growing it row by row.
Corpus['text_final'] = processed_tweets

In [6]:
# Display the DataFrame, now holding the tokenised 'tweet' column and the new 'text_final' column.
Corpus

In [7]:
# Hold out 25% of the rows for evaluation; features are the pre-processed
# text, targets are the subtask_a labels.
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    Corpus['text_final'],
    Corpus['subtask_a'],
    test_size=0.25,
)

In [8]:
# Encode the string labels as integers. The mapping is learned from the
# training labels only and then reused for the test labels, so both splits
# are guaranteed to share the same label -> integer assignment.
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.transform(Test_Y)

In [9]:
# Build TF-IDF features limited to the 5000 most frequent terms.
# The vocabulary and IDF weights are learned from the training split only;
# fitting on the full corpus would leak information from the test rows into
# the features used for evaluation.
Tfidf_vect = TfidfVectorizer(max_features=5000)
Train_X_Tfidf = Tfidf_vect.fit_transform(Train_X)
# The test split is only transformed with the vocabulary learned above.
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [10]:

# Train a linear-kernel support vector classifier on the TF-IDF features.
# degree and gamma are not passed: scikit-learn ignores both for kernel='linear'.
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X_Tfidf, Train_Y)
# Predict the labels of the held-out split.
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Confusion matrix: rows are the true labels, columns the predicted labels.
cf_matrix = confusion_matrix(Test_Y, predictions_SVM)
# Annotate each cell with its count and label the axes so the figure can be
# read on its own.
ax = sns.heatmap(cf_matrix, annot=True, fmt='d', cmap='Blues')
ax.set(xlabel='Predicted label', ylabel='True label', title='SVM confusion matrix')
cf_matrix

In [11]:
# Accuracy on the held-out split, as a percentage. Arguments follow the
# (y_true, y_pred) order used by the other metric calls in this notebook.
print("SVM Accuracy Score -> ",accuracy_score(Test_Y, predictions_SVM)*100)

In [12]:
# Macro-averaged F1 on the held-out split (a single score, despite the name).
f1_arr = f1_score(
    y_true=Test_Y,
    y_pred=predictions_SVM,
    average="macro",
)
print("SVM F1 Score :", f1_arr)

In [13]:
# Apply the training-time pre-processing to new tweet(s) so they can be
# vectorised and classified. After this cell `arr` holds one string per
# input tweet, in the same str(list of lemmas) form as Corpus['text_final'].
arr = ["#HappyBdayPMModi One may hate him. But u should agree that he is the most hard working PM India has ever witnessed.😊 URL"]
arr = [entry.lower() for entry in arr]
# Tokenisation: split every tweet into a list of word tokens.
arr = [word_tokenize(entry) for entry in arr]
# Drop stop words and non-alphabetic tokens, then lemmatise the rest.
# WordNetLemmatizer needs a part of speech; tags that do not start with
# J (adjective), V (verb) or R (adverb) fall back to noun.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# Loop-invariant helpers, built once instead of once per token / per tweet.
stop_words = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()

processed_entries = []
for entry in arr:
    # Lemmas kept for the current tweet.
    Final_words = []
    # pos_tag yields (word, tag) pairs; the first letter of the tag selects
    # the WordNet part of speech through tag_map.
    for word, tag in pos_tag(entry):
        print(word, tag)
        if word not in stop_words and word.isalpha():
            Final_words.append(word_Lemmatized.lemmatize(word, tag_map[tag[0]]))
    # One document per tweet. Storing the bare word list in `arr` would make
    # the vectoriser treat every word as a separate document and produce one
    # prediction per word instead of one per tweet.
    processed_entries.append(str(Final_words))
arr = processed_entries
print(arr)


In [14]:
# Vectorise `arr` with the already-fitted TF-IDF model; every element of
# `arr` is treated as one document.
Test_Tfidf = Tfidf_vect.transform(raw_documents=arr)

In [15]:
# Predict an encoded label for every row of Test_Tfidf with the trained SVM.
SVM.predict(X=Test_Tfidf)

In [16]:
from sklearn.metrics import matthews_corrcoef

In [17]:
# Matthews correlation coefficient on the held-out split. Arguments follow
# the (y_true, y_pred) order used by the other metric calls in this notebook.
matthews_corrcoef(Test_Y, predictions_SVM)