In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('filter_twitter_data.csv')

In [3]:
df.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [18]:
df.isnull().sum()

target               0
id                   0
date                 0
flag                 0
user                 0
text                 0
stemmed_content    495
dtype: int64

In [20]:
df['stemmed_content'] = df['stemmed_content'].fillna("")

In [21]:
df.isnull().sum()

target             0
id                 0
date               0
flag               0
user               0
text               0
stemmed_content    0
dtype: int64

In [22]:
X = df['stemmed_content'].values
X

array(['switchfoot http twitpic com zl awww bummer shoulda got david carr third day',
       'upset updat facebook text might cri result school today also blah',
       'kenichan dive mani time ball manag save rest go bound', ...,
       'readi mojo makeov ask detail',
       'happi th birthday boo alll time tupac amaru shakur',
       'happi charitytuesday thenspcc sparkschar speakinguph h'],
      shape=(1600000,), dtype=object)

In [23]:
y = df['target'].values
y

array([0, 0, 0, ..., 1, 1, 1], shape=(1600000,))

In [24]:
df['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

In [34]:
df['stemmed_content'].count()

np.int64(1600000)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

In [27]:
X_train

array(['eringifford pleas give swift kick ars chang ever hit big',
       'alexbach pretti awesom handl',
       'neck red like boil lobster give massiv amount heat', ...,
       'photovia tiresom love wake photo natur http tumblr com xek w p',
       'bvonros bout time someon els join twittervers',
       'bought chocol bar quot win free bar quot label win either'],
      shape=(1120000,), dtype=object)

In [29]:
# converting word into vector
vectorizer = TfidfVectorizer()

In [30]:
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [32]:
print(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 8270623 stored elements and shape (1120000, 422699)>
  Coords	Values
  (0, 112037)	0.5651556589513604
  (0, 291637)	0.22264901619038097
  (0, 136062)	0.2364424015472906
  (0, 359280)	0.3450902555028528
  (0, 198336)	0.28665036448368497
  (0, 21527)	0.3635478715103337
  (0, 61034)	0.2530840904948045
  (0, 114162)	0.23079857294575395
  (0, 153109)	0.26109664702445023
  (0, 37734)	0.2370816726175219
  (1, 8791)	0.7658890833591377
  (1, 295816)	0.3236508208308959
  (1, 26374)	0.3029474218759125
  (1, 145600)	0.46571119564116675
  (2, 136062)	0.25234168398086554
  (2, 262411)	0.3480512551250672
  (2, 307524)	0.2943887324780582
  (2, 215037)	0.16897603366597974
  (2, 43171)	0.39909724657997897
  (2, 219324)	0.4055713900329562
  (2, 233325)	0.35803608369152534
  (2, 13646)	0.3722922363896325
  (2, 148748)	0.330769196609809
  (3, 149529)	0.5460720186836305
  (3, 104204)	0.5754627156511894
  :	:
  (1119997, 72146)	0.16364990870602078

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "SVM": LinearSVC(),
}

In [37]:
trained_models = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    print(f"{name} Accuracy: {acc:.3f}")
    trained_models[name] = model

Logistic Regression Accuracy: 0.777
Naive Bayes Accuracy: 0.755
SVM Accuracy: 0.769


In [38]:
trained_models

{'Logistic Regression': LogisticRegression(max_iter=1000),
 'Naive Bayes': MultinomialNB(),
 'SVM': LinearSVC()}

In [39]:
import pickle

with open('Log_ML_model.pkl', 'wb') as file:
    pickle.dump(trained_models['Logistic Regression'], file)

with open('tfidf.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)