# Politicians & celebrities


Decide if given tweets are by politicians, celebrities, biz&tech leaders or by internet platforms.

In [1]:
import csv

import pandas as pd
import sklearn.feature_extraction.text
import sklearn.model_selection
import sklearn.naive_bayes

In [2]:
data = pd.read_csv('contest_tweets.csv', index_col=0)

In [3]:
data

Unnamed: 0,text,handle,target
0,“The case for learned index structures” - repl...,benhamner,biz&tech
1,@Smerity Lock you in a black box with a window...,benhamner,biz&tech
2,What policy outcomes are you aiming to achieve...,benhamner,biz&tech
3,“Machine learning for systems and systems for ...,benhamner,biz&tech
4,From the number of talks @goodfellow_ian’s giv...,benhamner,biz&tech
...,...,...,...
64501,Life couldn't be better right now. 😊,ddlovato,celebrity
64502,First Monday back in action. I'd say 21.6 mile...,ddlovato,celebrity
64503,"Crime shows, buddy, snuggles = the perfect Sun...",ddlovato,celebrity
64504,❄️ http://t.co/sHCFdPpGPa,ddlovato,celebrity


In [4]:
X, Y = data.iloc[:, 0], data.iloc[:, -1]

In [5]:
X, Y

(0        “The case for learned index structures” - repl...
 1        @Smerity Lock you in a black box with a window...
 2        What policy outcomes are you aiming to achieve...
 3        “Machine learning for systems and systems for ...
 4        From the number of talks @goodfellow_ian’s giv...
                                ...                        
 64501                 Life couldn't be better right now. 😊
 64502    First Monday back in action. I'd say 21.6 mile...
 64503    Crime shows, buddy, snuggles = the perfect Sun...
 64504                            ❄️ http://t.co/sHCFdPpGPa
 64505                        ❤️❄️✈️ http://t.co/ixmB5lv17Z
 Name: text, Length: 32026, dtype: object, 0         biz&tech
 1         biz&tech
 2         biz&tech
 3         biz&tech
 4         biz&tech
            ...    
 64501    celebrity
 64502    celebrity
 64503    celebrity
 64504    celebrity
 64505    celebrity
 Name: target, Length: 32026, dtype: object)

In [6]:
train_X, test_X, train_Y, test_Y = sklearn.model_selection.train_test_split(
    X, Y, test_size=0.25)

In [7]:
question = pd.read_csv('contest_testset.csv', index_col=0)['text'].values
question.shape

(12424,)

## A simplest possible Naive Bayes

In [107]:
TRAINING_X, TRAINING_Y = X, Y

In [9]:
simplest_vectorizer = sklearn.feature_extraction.text.CountVectorizer()

In [10]:
simplest_word_vector = simplest_vectorizer.fit_transform(TRAINING_X)

In [11]:
simplest_vectorizer.get_feature_names()

['00',
 '000',
 '000km',
 '000s',
 '000x',
 '001',
 '002',
 '003',
 '004',
 '005',
 '006',
 '007',
 '007cigarjoe',
 '008',
 '009',
 '00a',
 '00c5moz4hc',
 '00gnh9nlet',
 '00gqyrkxuc',
 '00p',
 '00pm',
 '00tamznzez',
 '00z9tnhpra',
 '01',
 '010',
 '011',
 '012',
 '013',
 '014',
 '015',
 '01am',
 '01aqikgg',
 '01azsbwzwq',
 '01gmbwza',
 '01lm5ct5qv',
 '01pm',
 '01qnthblfq',
 '01qnthtmwy',
 '01sfi7gqus',
 '01wes',
 '01x5elmlm2',
 '01y5cdizjm',
 '01zn9ui',
 '02',
 '025bly9i0q',
 '0279xcscce',
 '02a29xtebt',
 '02cxehbvvi',
 '02gufwr4c7',
 '02nmqn3znq',
 '02pewzeehr',
 '03',
 '032c',
 '032yv6f',
 '03b9y1lhqf',
 '03h7uybzjg',
 '03kftbjyyg',
 '03zdchczml',
 '04',
 '04aewlx8gf',
 '04aqlinszv',
 '04dzntypwg',
 '04fehtxvfm',
 '04haprjhlj',
 '04i0itjhdy',
 '04pw1wrplw',
 '04qqpgylt7',
 '04yqg5w2su',
 '05',
 '05jdlouyop',
 '05jet1tz6h',
 '05k2kzdtvo',
 '05mrpp2g',
 '05ny0wjhkl',
 '05sjurjfbf',
 '05uftluzkr',
 '05xomtttxe',
 '05xwap3fmk',
 '06',
 '063keauoff',
 '06a7phesnm',
 '06pgpxme1k',
 '078c0wv

In [12]:
simplest_classifier = sklearn.naive_bayes.MultinomialNB(fit_prior=False)

In [13]:
simplest_classifier.fit(simplest_word_vector, TRAINING_Y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=False)

In [14]:
simplest_classifier.score(simplest_vectorizer.transform(test_X), test_Y)

0.9334332459098289

### The prediction itself

In [15]:
simplest_answer = simplest_classifier.predict(simplest_vectorizer.transform(question))
simplest_answer

array(['politician', 'celebrity', 'celebrity', ..., 'politician',
       'celebrity', 'celebrity'], dtype='<U16')

In [16]:
simplest_answer_df = pd.DataFrame.from_dict(
    {'target': [v for v in simplest_answer]})
simplest_answer_df

Unnamed: 0,target
0,politician
1,celebrity
2,celebrity
3,politician
4,politician
...,...
12419,celebrity
12420,celebrity
12421,politician
12422,celebrity


In [17]:
with open('simplest-result.csv', 'w') as f:
    f.write(
        simplest_answer_df.to_csv(index=True, index_label='id', quoting=csv.QUOTE_NONNUMERIC))

## Separate classifier for every target

### Dataset splitting

#### IDs extraction

In [284]:
politician_ids = TRAINING_Y[TRAINING_Y == 'politician'].index
politician_ids.shape

(6081,)

In [285]:
celebrity_ids = TRAINING_Y[TRAINING_Y == 'celebrity'].index
celebrity_ids.shape

(18377,)

In [286]:
biztech_ids = TRAINING_Y[TRAINING_Y == 'biz&tech'].index
biztech_ids.shape

(2701,)

In [287]:
internetplatform_ids = TRAINING_Y[TRAINING_Y == 'internetplatform'].index
internetplatform_ids.shape

(4867,)

#### Training targets preparation

In [288]:
politician_y = TRAINING_Y.copy()
politician_y[:] = 0
politician_y[politician_ids] = 1
np.sum(politician_y)

6081

In [289]:
celebrity_y = TRAINING_Y.copy()
celebrity_y[:] = 0
celebrity_y[celebrity_ids] = 1
np.sum(celebrity_y)

18377

In [290]:
biztech_y = TRAINING_Y.copy()
biztech_y[:] = 0
biztech_y[biztech_ids] = 1
np.sum(biztech_y)

2701

In [291]:
internetplatform_y = TRAINING_Y.copy()
internetplatform_y[:] = 0
internetplatform_y[internetplatform_ids] = 1
np.sum(internetplatform_y)

4867

### Classifiers training

In [292]:
politician_classifier = sklearn.naive_bayes.MultinomialNB(fit_prior=False)
celebrity_classifier = sklearn.naive_bayes.MultinomialNB(fit_prior=False)
biztech_classifier = sklearn.naive_bayes.MultinomialNB(fit_prior=False)
internetplatform_classifier = sklearn.naive_bayes.MultinomialNB(fit_prior=False)

In [293]:
separate_classes_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    stop_words='english')
separate_classes_vector = separate_classes_vectorizer.fit_transform(TRAINING_X)

In [294]:
politician_classifier.fit(separate_classes_vector, politician_y)
celebrity_classifier.fit(separate_classes_vector, celebrity_y)
biztech_classifier.fit(separate_classes_vector, biztech_y)
internetplatform_classifier.fit(separate_classes_vector, internetplatform_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=False)

### Prediction

In [295]:
def separate_classes_predict(question):
    preds = np.array([-42] * len(question))

    politician_preds = politician_classifier.predict_proba(
        separate_classes_vectorizer.transform(question))[:, 1]
    celebrity_preds = celebrity_classifier.predict_proba(
        separate_classes_vectorizer.transform(question))[:, 1]
    biztech_preds = biztech_classifier.predict_proba(
        separate_classes_vectorizer.transform(question))[:, 1]
    internetplatform_preds = internetplatform_classifier.predict_proba(
        separate_classes_vectorizer.transform(question))[:, 1]
    
    for idx, _ in enumerate(a):
        preds[idx] = np.argmax([
            politician_preds[idx], 
            celebrity_preds[idx],
            biztech_preds[idx],
            internetplatform_preds[idx]
        ])
    
    return preds

In [296]:
separate_classes_answer = separate_classes_predict(question)

separate_classes_answer_df = pd.DataFrame.from_dict(
    {'target': [v for v in separate_classes_answer]})

separate_classes_answer_df[separate_classes_answer_df['target'] == 0] = 'politician'
separate_classes_answer_df[separate_classes_answer_df['target'] == 1] = 'celebrity'
separate_classes_answer_df[separate_classes_answer_df['target'] == 2] = 'biz&tech'
separate_classes_answer_df[separate_classes_answer_df['target'] == 3] = 'internetplatform'

separate_classes_answer_df

Unnamed: 0,target
0,politician
1,celebrity
2,celebrity
3,politician
4,politician
...,...
12419,celebrity
12420,celebrity
12421,politician
12422,celebrity


In [297]:
with open('separate-classes-result.csv', 'w') as f:
    f.write(
        separate_classes_answer_df.to_csv(
            index=True, index_label='id', quoting=csv.QUOTE_NONNUMERIC))

In [298]:
separate_classes_vectorizer.stop_words_

set()