# Politicians & celebrities


Decide if given tweets are by politicians, celebrities, biz&tech leaders or by internet platforms.

In [189]:
import csv

import pandas as pd
import sklearn.feature_extraction.text
import sklearn.model_selection
import sklearn.naive_bayes

In [8]:
data = pd.read_csv('contest_tweets.csv', index_col=0)

In [13]:
data

Unnamed: 0,text,handle,target
0,“The case for learned index structures” - repl...,benhamner,biz&tech
1,@Smerity Lock you in a black box with a window...,benhamner,biz&tech
2,What policy outcomes are you aiming to achieve...,benhamner,biz&tech
3,“Machine learning for systems and systems for ...,benhamner,biz&tech
4,From the number of talks @goodfellow_ian’s giv...,benhamner,biz&tech
...,...,...,...
64501,Life couldn't be better right now. 😊,ddlovato,celebrity
64502,First Monday back in action. I'd say 21.6 mile...,ddlovato,celebrity
64503,"Crime shows, buddy, snuggles = the perfect Sun...",ddlovato,celebrity
64504,❄️ http://t.co/sHCFdPpGPa,ddlovato,celebrity


In [97]:
X, Y = data.iloc[:, 0], data.iloc[:, -1]

In [98]:
X, Y

(0        “The case for learned index structures” - repl...
 1        @Smerity Lock you in a black box with a window...
 2        What policy outcomes are you aiming to achieve...
 3        “Machine learning for systems and systems for ...
 4        From the number of talks @goodfellow_ian’s giv...
                                ...                        
 64501                 Life couldn't be better right now. 😊
 64502    First Monday back in action. I'd say 21.6 mile...
 64503    Crime shows, buddy, snuggles = the perfect Sun...
 64504                            ❄️ http://t.co/sHCFdPpGPa
 64505                        ❤️❄️✈️ http://t.co/ixmB5lv17Z
 Name: text, Length: 32026, dtype: object, 0         biz&tech
 1         biz&tech
 2         biz&tech
 3         biz&tech
 4         biz&tech
            ...    
 64501    celebrity
 64502    celebrity
 64503    celebrity
 64504    celebrity
 64505    celebrity
 Name: target, Length: 32026, dtype: object)

In [144]:
train_X, test_X, train_Y, test_Y = sklearn.model_selection.train_test_split(
    X, Y, test_size=0.25)

In [168]:
question = pd.read_csv('contest_testset.csv', index_col=0)['text'].values
question.shape

(12424,)

## A simplest possible Naive Bayes

In [145]:
simplest_vectorizer = sklearn.feature_extraction.text.CountVectorizer()

In [146]:
simplest_word_vector = simplest_vectorizer.fit_transform(train_X)

In [147]:
simplest_vectorizer.get_feature_names()

['00',
 '000',
 '000km',
 '000s',
 '000x',
 '001',
 '002',
 '003',
 '004',
 '005',
 '006',
 '007',
 '008',
 '009',
 '00a',
 '00c5moz4hc',
 '00gnh9nlet',
 '00p',
 '00pm',
 '00tamznzez',
 '00z9tnhpra',
 '01',
 '011',
 '013',
 '015',
 '01am',
 '01azsbwzwq',
 '01gmbwza',
 '01pm',
 '01qnthblfq',
 '01sfi7gqus',
 '01wes',
 '01x5elmlm2',
 '01y5cdizjm',
 '01zn9ui',
 '02',
 '025bly9i0q',
 '0279xcscce',
 '02a29xtebt',
 '02gufwr4c7',
 '02nmqn3znq',
 '02pewzeehr',
 '03',
 '032c',
 '032yv6f',
 '03b9y1lhqf',
 '03h7uybzjg',
 '03kftbjyyg',
 '03zdchczml',
 '04',
 '04aewlx8gf',
 '04dzntypwg',
 '04fehtxvfm',
 '04haprjhlj',
 '04i0itjhdy',
 '04pw1wrplw',
 '04qqpgylt7',
 '04yqg5w2su',
 '05',
 '05jdlouyop',
 '05jet1tz6h',
 '05k2kzdtvo',
 '05mrpp2g',
 '05ny0wjhkl',
 '05sjurjfbf',
 '05uftluzkr',
 '05xwap3fmk',
 '06',
 '063keauoff',
 '06a7phesnm',
 '06pgpxme1k',
 '078c0wvzxq',
 '07dli5e6zg',
 '07hwyljoy8',
 '07ldw7zg',
 '07p4dhmx',
 '07rquaresma',
 '07trstp590',
 '086j77qy58',
 '087',
 '088l9l2jd1',
 '08dyflqdsr

In [148]:
simplest_classifier = sklearn.naive_bayes.MultinomialNB(fit_prior=False)

In [149]:
simplest_classifier.fit(simplest_word_vector, train_Y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=False)

In [158]:
simplest_classifier.score(simplest_vectorizer.transform(test_X), test_Y)

0.8806044710877982

### The prediction itself

In [175]:
simplest_answer = simplest_classifier.predict(simplest_vectorizer.transform(question))
simplest_answer

array(['politician', 'celebrity', 'celebrity', ..., 'politician',
       'celebrity', 'celebrity'], dtype='<U16')

In [179]:
simplest_answer_df = pd.DataFrame.from_dict(
    {'target': [v for v in simplest_answer]})
simplest_answer_df

Unnamed: 0,target
0,politician
1,celebrity
2,celebrity
3,politician
4,politician
...,...
12419,celebrity
12420,celebrity
12421,politician
12422,celebrity


In [190]:
with open('simplest-result.csv', 'w') as f:
    f.write(
        simplest_answer_df.to_csv(index=True, index_label='id', quoting=csv.QUOTE_NONNUMERIC))