In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import wordnet
import nltk
from nltk import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report,f1_score
import warnings
lem = WordNetLemmatizer()
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('training.1600000.processed.noemoticon.csv',encoding='ISO-8859-1')

In [3]:
df_copy = df.copy()

In [4]:
df.head(2)

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...


In [5]:
data = df[['text','target']]

In [6]:
data.head(2)

Unnamed: 0,text,target
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0


In [7]:
data.target.unique()

array([0, 4])

In [8]:
data['target'] = np.where(data['target']==4,1,data['target'])

In [9]:
data.target.unique()

array([0, 1])

In [10]:
data.isna().sum()

text      0
target    0
dtype: int64

In [11]:
def contractions(s):
    s = re.sub(r"won't", "will not",s)
    s = re.sub(r"would't", "would not",s)
    s = re.sub(r"could't", "could not",s)
    s = re.sub(r"\'d", " would",s)
    s = re.sub(r"can\'t", "can not",s)
    s = re.sub(r"n\'t", " not", s)
    s= re.sub(r"\'re", " are", s)
    s = re.sub(r"\'s", " is", s)
    s = re.sub(r"\'ll", " will", s)
    s = re.sub(r"\'t", " not", s)
    s = re.sub(r"\'ve", " have", s)
    s = re.sub(r"\'m", " am", s)
    return s

In [12]:
def pos_tag_simplified(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

In [13]:
data['pre_process'] = data['text'].apply(lambda x: ' '.join(x.lower() for x in str(x).split()))

In [14]:
data['pre_process']=data['pre_process'].apply(lambda x:contractions(x))

In [15]:
data['pre_process'] = data['pre_process'].apply(lambda x: ' '.join([re.sub('[^A-z]+','',x) for x in nltk.word_tokenize(x)]))

In [16]:
data['pre_process'] = data['pre_process'].apply(lambda x: re.sub(' +',' ',x))

In [17]:
stop = nltk.corpus.stopwords.words('english')
data['pre_process'] = data['pre_process'].apply(lambda x: ' '.join([x for x in x.split() if x not in stop]))

In [18]:
tagged = nltk.pos_tag(data['pre_process'])

In [19]:
simple_tagged = list(map(lambda x:(x[0],pos_tag_simplified(x[1])),tagged))

In [21]:
lemmatized_ = [lem.lemmatize(i[0],i[1]) if i[1] != None else lem.lemmatize(i[0]) for i in simple_tagged]

In [22]:
data['pre_process'] = lemmatized_

In [23]:
x_train,x_test,y_train,y_test = train_test_split(data['pre_process'],data['target'],test_size=0.25,random_state=30)

In [24]:
vec = TfidfVectorizer(use_idf=True)

In [25]:
x_train = vec.fit_transform(x_train)
x_test = vec.transform(x_test)

In [26]:
clf = LinearSVC(random_state=0)

In [27]:
clf.fit(x_train,y_train)

In [28]:
pred_clf = clf.predict(x_test)

In [29]:
report = classification_report(y_test,pred_clf,output_dict=True)

In [30]:
report

{'0': {'precision': 0.7821054621827268,
  'recall': 0.7634722921977183,
  'f1-score': 0.7726765583428834,
  'support': 200467},
 '1': {'precision': 0.7679201601495773,
  'recall': 0.7863010128650398,
  'f1-score': 0.7770018967814145,
  'support': 199533},
 'accuracy': 0.77486,
 'macro avg': {'precision': 0.7750128111661521,
  'recall': 0.7748866525313791,
  'f1-score': 0.774839227562149,
  'support': 400000},
 'weighted avg': {'precision': 0.7750293725062758,
  'recall': 0.77486,
  'f1-score': 0.774834177729522,
  'support': 400000}}

#### Logistic Regression

In [31]:
model_log = LogisticRegression()

In [32]:
model_log.fit(x_train,y_train)

In [33]:
pred_log = model_log.predict(x_test)

In [34]:
report = classification_report(y_test,pred_log)

In [35]:
print(report)

              precision    recall  f1-score   support

           0       0.79      0.76      0.78    200467
           1       0.77      0.80      0.79    199533

    accuracy                           0.78    400000
   macro avg       0.78      0.78      0.78    400000
weighted avg       0.78      0.78      0.78    400000

