In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
import nltk
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


True

In [3]:
column_names = ["target","ids","date","flag","user","text"]
data = pd.read_csv("/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv",names = column_names, encoding = "ISO-8859-1")

In [4]:
data.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
data.shape

(1600000, 6)

In [6]:
data.isnull().sum()

target    0
ids       0
date      0
flag      0
user      0
text      0
dtype: int64

In [7]:
data["target"].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [8]:
data.replace({"target": {4: 1}}, inplace=True)

In [9]:
data["target"].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

In [10]:
wnl = WordNetLemmatizer()

In [11]:
def stem(txt):
    lemmatized_txt = re.sub("[^a-zA-Z]"," ",txt)
    lemmatized_txt = lemmatized_txt.lower()
    lemmatized_txt = lemmatized_txt.split()
    lemmatized_txt = [wnl.lemmatize(word) for word in lemmatized_txt if not word in stopwords.words("english")]
    lemmatized_txt = " ".join(lemmatized_txt)
    
    return lemmatized_txt

In [12]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr/share/nltk_data

In [14]:
from tqdm import tqdm
tqdm.pandas(desc="Processing", total=len(data))
data["lemmatized_text"] = data["text"].progress_apply(stem)

Processing: 100%|██████████| 1600000/1600000 [57:41<00:00, 462.24it/s] 


In [15]:
data.head()

Unnamed: 0,target,ids,date,flag,user,text,lemmatized_text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset update facebook texting might cry result...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dived many time ball managed save res...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole body feel itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behaving mad see


In [16]:
x = data["lemmatized_text"].values
y = data["target"].values

In [17]:
x

array(['switchfoot http twitpic com zl awww bummer shoulda got david carr third day',
       'upset update facebook texting might cry result school today also blah',
       'kenichan dived many time ball managed save rest go bound', ...,
       'ready mojo makeover ask detail',
       'happy th birthday boo alll time tupac amaru shakur',
       'happy charitytuesday thenspcc sparkscharity speakinguph h'],
      dtype=object)

In [18]:
y

array([0, 0, 0, ..., 1, 1, 1])

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 1, test_size = 0.2, stratify = y)

In [20]:
print(x_train.shape, x_test.shape)

(1280000,) (320000,)


In [21]:
x_train

array(['rise shining lol min later planned rush door time',
       'nd interview today looking promising', 'emilyalbracht feel pain',
       ..., 'bookwitter welcome change mind though let know',
       'howcoza bet bring backup',
       'window linux box installing bsd apple ipod yeah'], dtype=object)

In [22]:
vect = TfidfVectorizer()
x_train = vect.fit_transform(x_train)
x_test = vect.transform(x_test)

In [23]:
print(x_train)

  (0, 440530)	0.1929681548681218
  (0, 115853)	0.3518571199454984
  (0, 374905)	0.40589961680104825
  (0, 341624)	0.38084348419831615
  (0, 245631)	0.27669585876387015
  (0, 286328)	0.31815241708182246
  (0, 257973)	0.19844297369533787
  (0, 392919)	0.38207537217834686
  (0, 368111)	0.4074565466261997
  (1, 349064)	0.6217889111761643
  (1, 258815)	0.35724828017937604
  (1, 443093)	0.25750987992262947
  (1, 197034)	0.47206593794010765
  (1, 307132)	0.4433899279710779
  (2, 329698)	0.41904009185276003
  (2, 141222)	0.29689738502585433
  (2, 128370)	0.8580543946538821
  (3, 16205)	0.4381368190504687
  (3, 263669)	0.24100633274062971
  (3, 148529)	0.19960061504481333
  (3, 180084)	0.3205635672139369
  (3, 16182)	0.14787807029030983
  (3, 136738)	0.32160044516814057
  (3, 473416)	0.1860146591983528
  (3, 404853)	0.23918557392607273
  :	:
  (1279995, 176980)	0.4850191694142752
  (1279995, 48118)	0.46036176375173327
  (1279995, 443363)	0.35909557801495534
  (1279995, 16182)	0.2357160277626553

In [26]:
from sklearn.linear_model import LogisticRegression

In [37]:
final_model = LogisticRegression(max_iter = 2000,verbose = 2)
final_model.fit(x_train, y_train)

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =       496911     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  8.87228D+05    |proj g|=  2.90605D+03


 This problem is unconstrained.



At iterate    1    f=  8.30808D+05    |proj g|=  1.24120D+04

At iterate    2    f=  8.20224D+05    |proj g|=  7.61656D+04

At iterate    3    f=  7.37073D+05    |proj g|=  9.75545D+03

At iterate    4    f=  7.26566D+05    |proj g|=  2.84728D+03

At iterate    5    f=  7.18748D+05    |proj g|=  1.64964D+03

At iterate    6    f=  6.92072D+05    |proj g|=  1.39761D+03

At iterate    7    f=  6.51851D+05    |proj g|=  8.21894D+03

At iterate    8    f=  6.45765D+05    |proj g|=  3.46548D+03

At iterate    9    f=  6.39193D+05    |proj g|=  4.58121D+03

At iterate   10    f=  6.36074D+05    |proj g|=  6.32363D+03

At iterate   11    f=  6.32635D+05    |proj g|=  4.56575D+03

At iterate   12    f=  6.28244D+05    |proj g|=  5.97889D+02

At iterate   13    f=  6.27749D+05    |proj g|=  1.22075D+04

At iterate   14    f=  6.25508D+05    |proj g|=  1.55444D+03

At iterate   15    f=  6.24021D+05    |proj g|=  3.40055D+03

At iterate   16    f=  6.22487D+05    |proj g|=  5.17115D+03

At iter

In [38]:
x_train_pred = final_model.predict(x_train)
print(accuracy_score(y_train, x_train_pred))

0.81514609375


In [39]:
x_test_pred = final_model.predict(x_test)
print(accuracy_score(y_test, x_test_pred))

0.782696875


In [40]:
import pickle
pickle.dump(final_model, open("/kaggle/working/random_forest.pkl","wb"))

In [41]:
new_model = pickle.load(open("/kaggle/working/random_forest.pkl","rb"))

In [47]:
print(new_model.predict(x_test[4000]),y_test[4000])

[0] 0


In [48]:
pickle.dump(vect, open("/kaggle/working/vectorizer.pkl", "wb"))