In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer,word_tokenize


In [2]:
train=pd.read_csv('train_2kmZucJ.csv')
train.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [3]:
train.shape

(7920, 3)

In [4]:

eng_stop_words=stopwords.words('english')
lem=WordNetLemmatizer()

In [5]:
def preprocess(tweet):
    final_tokens=' '
    token_no_url=re.sub(r'http\S+', ' ',tweet)
    token_no_spl= re.sub('[^A-Za-z]+', ' ', token_no_url) 
    
    tokens=word_tokenize(token_no_spl,language='english')
    filtered_tokens=[token.lower() for token in tokens if token.lower() not in eng_stop_words]
    lemmed_tokens=[lem.lemmatize(token,pos='v') for token in filtered_tokens]
    final_tokens=final_tokens.join(lemmed_tokens)
    return final_tokens
    
    

In [6]:
train['cleaned_tweets']=train['tweet'].apply(lambda x:preprocess(x))

['fingerprint', 'Pregnancy', 'Test', 'android', 'apps', 'beautiful', 'cute', 'health', 'igers', 'iphoneonly', 'iphonesia', 'iphone']
['Finally', 'a', 'transparant', 'silicon', 'case', 'Thanks', 'to', 'my', 'uncle', 'yay', 'Sony', 'Xperia', 'S', 'sonyexperias']
['We', 'love', 'this', 'Would', 'you', 'go', 'talk', 'makememories', 'unplug', 'relax', 'iphone', 'smartphone', 'wifi', 'connect']


In [7]:
train

Unnamed: 0,id,label,tweet,cleaned_tweets
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,fingerprint pregnancy test android apps beauti...
1,2,0,Finally a transparant silicon case ^^ Thanks t...,finally transparant silicon case thank uncle y...
2,3,0,We love this! Would you go? #talk #makememorie...,love would go talk makememories unplug relax i...
3,4,0,I'm wired I know I'm George I was made that wa...,wire know george make way iphone cute daventry...
4,5,1,What amazing service! Apple won't even talk to...,amaze service apple even talk question unless ...
...,...,...,...,...
7915,7916,0,Live out loud #lol #liveoutloud #selfie #smile...,live loud lol liveoutloud selfie smile sony mu...
7916,7917,0,We would like to wish you an amazing day! Make...,would like wish amaze day make every minute co...
7917,7918,0,Helping my lovely 90 year old neighbor with he...,help lovely year old neighbor ipad morning mak...
7918,7919,0,Finally got my #smart #pocket #wifi stay conne...,finally get smart pocket wifi stay connect any...


In [8]:
tweets=train['cleaned_tweets']
label=train['label']

In [9]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(tweets,label,test_size=.20,random_state=42)

In [10]:
tfidf=TfidfVectorizer()
x_train_mat=tfidf.fit_transform(x_train)
x_test_mat=tfidf.transform(x_test)

In [11]:
x_train_mat_df=pd.DataFrame(x_train_mat.toarray(),columns=tfidf.get_feature_names())
x_train_mat_df.head()

Unnamed: 0,aa,aag,aah,aalborg,aand,aapl,aaron,aaronbrandt,aarp,aarrrggghhhh,...,zunjndm,zurich,zwckahsl,zwcn,zx,zxw,zxzh,zzita,zzjvgtyaxl,zznj
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
tfidf.get_feature_names()

['aa',
 'aag',
 'aah',
 'aalborg',
 'aand',
 'aapl',
 'aaron',
 'aaronbrandt',
 'aarp',
 'aarrrggghhhh',
 'aashamsakal',
 'aaydojbfkq',
 'aayp',
 'ab',
 'abah',
 'abajournal',
 'abay',
 'abbs',
 'abc',
 'abdullahoashraf',
 'abe',
 'aber',
 'abercrombie',
 'abfad',
 'abfadofficial',
 'abi',
 'ability',
 'abit',
 'able',
 'ableton',
 'aboard',
 'aboutalook',
 'abouttime',
 'abouttonight',
 'abp',
 'abs',
 'abscbn',
 'absence',
 'absolute',
 'absolutely',
 'absurd',
 'abt',
 'abu',
 'abudhabi',
 'abuja',
 'abujacity',
 'abujafct',
 'abujapeople',
 'abujaphones',
 'abukamalyasinpic',
 'abuse',
 'ac',
 'academia',
 'academy',
 'acap',
 'acc',
 'accelerate',
 'accept',
 'acceptable',
 'accesorios',
 'access',
 'accessible',
 'accessoires',
 'accessories',
 'accessory',
 'accessorypic',
 'accident',
 'accidental',
 'accidentally',
 'accompaniment',
 'accomplish',
 'accord',
 'account',
 'accountpic',
 'accra',
 'acct',
 'accurate',
 'ace',
 'acedemy',
 'acer',
 'acesse',
 'achi',
 'achieve',


In [13]:
x_train

4252    cool car wash idea theisland bankholidaymonday...
4428    photo th birthday sony walkman thinkgeek nobod...
7374    ipads biggest pile fuck planet want throw fuck...
1410    yearbook hmmmmm instagram instagood together f...
7896    piss macbook crash apple company nothing apple...
                              ...                        
5226    shana tova jewish newyear everyone may new yea...
5390               sick buy new cell phone chargers apple
860     want download free iphone app today spread sty...
7603    photo nikosx iphone beach holiday bw iphone bl...
7270    get iphone hehe iphone apple new finally seb lose
Name: cleaned_tweets, Length: 6336, dtype: object

In [14]:
from sklearn.naive_bayes import GaussianNB
GNB=GaussianNB()
GNB.fit(x_train_mat.toarray(),y_train)

GaussianNB()

In [15]:
from sklearn.metrics import f1_score
pre=GNB.predict(x_test_mat.toarray())
print(f1_score(y_test,pre,average='weighted'))


0.8025426887187398


In [16]:
test=pd.read_csv('test_oJQbWVk.csv')
test.head()

Unnamed: 0,id,tweet
0,7921,I hate the new #iphone upgrade. Won't let me d...
1,7922,currently shitting my fucking pants. #apple #i...
2,7923,"I'd like to puts some CD-ROMS on my iPad, is t..."
3,7924,My ipod is officially dead. I lost all my pict...
4,7925,Been fighting iTunes all night! I only want th...


In [17]:
test['cleaned_tweets']=test['tweet'].apply(lambda x:preprocess(x))

In [18]:

test_mat=tfidf_transform(test['cleaned_tweets'])

NameError: name 'tfidf_transform' is not defined

In [None]:
test_pre=GNB.predict(test_mat.toarray())