In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline

In [2]:
# Load the labelled training tweets and derive a binary target:
#   country == 1 -> country_code is 'US'
#   country == 0 -> anything else (including a missing country_code)
#
# The file is semicolon-delimited. A plain one-character separator (';')
# is used instead of '\;': the latter is an invalid escape sequence in a
# normal string literal and, being two characters long, is interpreted by
# pandas as a regular expression, which forces the slower python parsing
# engine and emits a ParserWarning.
train = (
    pd.read_csv("training.csv", sep=";")
    .loc[:, ["text", "country_code"]]
    # Duplicates are dropped on the (text, country_code) pair, before the
    # label is derived, so identical tweets are not counted twice.
    .drop_duplicates()
    .assign(country=lambda df: (df["country_code"] == "US").astype("int64"))
    .loc[:, ["text", "country"]]
)
train

  return func(*args, **kwargs)


Unnamed: 0,text,country
0,What colour is that virus? #coronavirus https:...,0
1,I don’t know if @Travistritt is taking request...,1
2,Tiny minuscule droplets of #Covid19 🦠 can last...,0
3,Do you have an old computer 💻 sitting around a...,0
4,we pray God keeps us safe.. #BENNIE needs you ...,0
...,...,...
9995,@JeremyKonyndyk “The biggest variable in this ...,0
9996,Let's fight against Corona Virus. SAHAJAYOGA M...,0
9997,#IndiaFightsCorona Know about the answers to ...,0
9998,#prepper skills #CoronavirusPandemic https://...,1


In [3]:
# Bag-of-words representation of the tweet text.
#   * text is lower-cased and accents are folded to ASCII,
#   * English stop words are removed,
#   * terms present in more than 40% of the documents are dropped (max_df),
#   * only single words (unigrams) are used.
vectorizer = CountVectorizer(
    lowercase=True,
    strip_accents="ascii",
    analyzer="word",
    stop_words="english",
    # max_features=150000,  # disabled: the vocabulary size is not capped
    max_df=0.4,
    ngram_range=(1, 1),
)

# Learn the vocabulary from the training text and build the sparse
# document-term count matrix in one pass.
X_train = vectorizer.fit_transform(train["text"])

# Show the learned vocabulary as the cell output.
vectorizer.get_feature_names_out()

array(['00', '000', '00000375', ..., 'zzp6nrhjxp', 'zzsgy43rc0',
       'zzw1h5cvqj'], dtype=object)

In [4]:
# Sanity check: (number of training documents, vocabulary size).
X_train.shape

(9986, 38503)

In [5]:
# tf_transformer = TfidfTransformer(use_idf=False).fit(X_train)
# X_train_tf = tf_transformer.transform(X_train)
# X_train_tf.shape

In [6]:
# Re-weight the raw term counts.
# NOTE: despite the variable name, use_idf=False disables the inverse
# document frequency term, so this step only applies sublinear term
# frequency scaling (1 + log(tf)) followed by L2 normalisation of each row.
tfidf_transformer = TfidfTransformer(
    use_idf=False,
    norm="l2",
    sublinear_tf=True,
)
X_train_tfidf = tfidf_transformer.fit_transform(X_train)

# The transformation re-weights values only; the shape must match X_train.
X_train_tfidf.shape

(9986, 38503)

In [7]:
# Multinomial Naive Bayes classifier.
#   alpha     -- additive smoothing strength
#   fit_prior -- False: use a uniform class prior instead of learning the
#                class frequencies from the training labels
clf = MultinomialNB(
    alpha=0.476,
    fit_prior=False,
)
# Alternative estimator, kept for reference:
# clf = svm.SVC(C=1.0, kernel='rbf', gamma=0.1)

In [8]:
# Chain vectorisation, term re-weighting and classification into a single
# estimator so that raw text can be passed straight to fit() / predict().
# `Pipeline` is imported together with the other sklearn names in the
# notebook's import cell.
text_clf = Pipeline(
    steps=[
        ("vect", vectorizer),
        ("tfidf", tfidf_transformer),
        ("clf", clf),
    ]
)

In [9]:
# Fit the whole pipeline on the raw training text and the binary US / non-US
# label. The step objects passed to the pipeline are (re)fitted by this call.
text_clf.fit(train["text"], train["country"])

Pipeline(steps=[('vect',
                 CountVectorizer(max_df=0.4, stop_words='english',
                                 strip_accents='ascii')),
                ('tfidf', TfidfTransformer(sublinear_tf=True, use_idf=False)),
                ('clf', MultinomialNB(alpha=0.476, fit_prior=False))])

In [10]:
# Load the evaluation tweets and derive the same binary target as for the
# training data (1 when country_code is 'US', otherwise 0).
#
# A plain ';' separator is used instead of '\;': the latter is an invalid
# escape sequence in a normal string literal and is treated by pandas as a
# regular expression, which forces the slower python parsing engine and
# emits a ParserWarning.
#
# Unlike the training data, duplicate rows are intentionally kept here.
#
# NOTE(review): at least one row shown in the cell outputs (index 4) looks
# identical in the training and test frames -- confirm that the two files do
# not overlap, otherwise the reported accuracy is optimistic.
test = (
    pd.read_csv("test.csv", sep=";")
    .loc[:, ["text", "country_code"]]
    # .drop_duplicates()
    .assign(country=lambda df: (df["country_code"] == "US").astype("int64"))
    .loc[:, ["text", "country"]]
)
test

  return func(*args, **kwargs)


Unnamed: 0,text,country
0,Big trip 5 mi to north...checked out Yountvill...,1
1,I think that @GovRonDeSantis is handling this ...,1
2,@AdamSandler will you be having a ZOOM Seder t...,0
3,Choudhary Family keeping themselves busy with ...,0
4,we pray God keeps us safe.. #BENNIE needs you ...,0
...,...,...
29999,I still don’t understand what privilege has to...,1
30000,Just closed up shop for at least the next 30 d...,1
30001,See! Miracles do happen. The solis family was ...,1
30002,🙏🏾 this is getting out of hand. 🤔 Prayers up t...,0


In [11]:
# Predict a label for every test tweet. The text column is cast to str so
# that non-string entries (e.g. missing values) do not break the vectorizer.
predicted = text_clf.predict(test["text"].astype(str))

# Accuracy = fraction of predictions that match the true label.
is_correct = predicted == test["country"]
print("Accuracy: {}".format(np.mean(is_correct)))

Accuracy: 0.781962405012665


In [12]:
# Persist the fitted pipeline (vectorizer, transformer and classifier) to
# disk so it can be reused without retraining.
with open('text_clf.pickle', 'wb') as model_file:
    pickle.dump(text_clf, model_file)

In [13]:
# Reload the pipeline from disk to verify that the saved artefact is usable.
# SECURITY: pickle.load can execute arbitrary code while deserialising; only
# load pickle files that come from a trusted source.
with open('text_clf.pickle', 'rb') as model_file:
    model = pickle.load(model_file)

In [14]:
# Smoke test of the reloaded pipeline: predict on the same test text
# (cast to str, as in the evaluation cell) and display the labels.
model.predict(test["text"].astype(str))

array([1, 1, 1, ..., 1, 0, 1])