In this lab, we will
- read our project data into a Pandas DataFrame
- write a function to compute simple features for each row of the data frame
- fit a LogisticRegression model to the data
- print the top coefficients
- compute measures of accuracy

I've given you starter code below. You should:
- First, try to get it to work with your data. It may require changing the `load_data` function to match the requirements of your data (e.g., what is the object you are classifying -- a tweet, a user, a news article?)
- Second, you should add additional features to the make_features function:
  - Be creative. It could be additional word features, or other meta data about the user, date, etc.
- As you try out different feature combinations, print out the coefficients and accuracy scores
- List any features that seem to improve accuracy. Why do you think that is?

In [232]:
from collections import Counter
import numpy as np
import pandas as pd
import re
import glob
import gzip
import json
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

In [239]:
def _percent_containing(texts, needle):
    """Return the percentage (0-100) of strings in `texts` that contain `needle`."""
    if not texts:
        # a user without any tweets has no URLs/mentions rather than an undefined ratio
        return 0.0
    return 100 * sum(needle in s for s in texts) / len(texts)


def load_data(datafile):
    """
    Read your data into a single pandas dataframe where
    - each row is an instance to be classified (here: one Twitter user)
    - there is a column called `label` which stores the class label
      ('bot' or 'human', taken from the sub-folder the user was read from)

    `datafile` is a directory containing a `bots` and a `humans` sub-folder.
    Every `*.json.gz` file in those folders is read as gzip-compressed JSON
    lines, one user object per line. Each user object must provide
    `screen_name`, `listed_count` and `tweets` (a list of dicts that each
    have a `full_text` entry).

    Returns a dataframe with a unique 0..n-1 index (bots first, then humans)
    and the columns `screen_name`, `tweets`, `listed_count`, `label`,
    `tweets_avg_urls` and `tweets_avg_mentions`. The last two are the
    percentage of the user's tweets that contain 'http' and '@' respectively.
    An empty dataframe with the same columns is returned if no files match.
    """
    columns = ['screen_name', 'tweets', 'listed_count']
    users = []
    labels = []
    for folder, label in (('/bots', 'bot'), ('/humans', 'human')):
        # sorted() makes the row order reproducible; glob order is arbitrary
        for path in sorted(glob.glob(datafile + folder + '/*.json.gz')):
            with gzip.open(path, 'rt', encoding='utf-8') as infile:
                for line in infile:
                    if not line.strip():
                        continue  # tolerate blank lines between records
                    users.append(json.loads(line))
                    labels.append(label)

    # build one frame directly so every row gets its own index label
    df = pd.DataFrame(users, columns=columns)
    df['label'] = labels

    tweets_avg_urls = []
    tweets_avg_mentions = []
    for u in users:
        texts = [t['full_text'] for t in u['tweets']]
        tweets_avg_urls.append(_percent_containing(texts, 'http'))
        tweets_avg_mentions.append(_percent_containing(texts, '@'))
    df['tweets_avg_urls'] = tweets_avg_urls
    df['tweets_avg_mentions'] = tweets_avg_mentions
    return df
# Load the labelled users. The argument is the directory that holds the
# `bots` and `humans` sub-folders of *.json.gz files (adjust to your machine).
df = load_data('/Users/sheepman/Downloads/bots/small')
df


Unnamed: 0,screen_name,tweets,listed_count,label,tweets_avg_urls,tweets_avg_mentions
0,carlos_eggbot,[{'created_at': 'Sat Jun 01 18:36:07 +0000 201...,0,bot,10.500000,0.000000
1,ecolo_ebooks,[{'created_at': 'Sat Jun 01 18:36:11 +0000 201...,2,bot,0.000000,0.000000
2,AllStarSMBot,[{'created_at': 'Sat Jun 01 18:36:28 +0000 201...,3,bot,0.000000,0.000000
3,saionji_en,[{'created_at': 'Sat Jun 01 18:36:52 +0000 201...,3,bot,0.000000,0.000000
4,KimClune,[{'created_at': 'Sat Jun 01 18:37:20 +0000 201...,329,bot,28.500000,2.500000
5,CatsDogsBOT,[{'created_at': 'Sat Jun 01 18:38:10 +0000 201...,3,bot,100.000000,0.000000
6,bluejovanka,[{'created_at': 'Sat Jun 01 18:38:14 +0000 201...,47,bot,37.688442,32.160804
7,anittavota4,[{'created_at': 'Sat Jun 01 18:39:19 +0000 201...,0,bot,0.000000,0.000000
8,justtraveluk,[{'created_at': 'Sat Jun 01 18:39:21 +0000 201...,11,bot,100.000000,0.000000
9,rhaudiencebot,[{'created_at': 'Sat Jun 01 18:40:08 +0000 201...,0,bot,0.000000,0.000000


In [240]:
# what is the distribution over class labels?
# (a notebook only echoes a cell's last expression, so print this one explicitly)
print(df.label.value_counts())
# what type does each column hold?
df.dtypes

screen_name             object
tweets                  object
listed_count             int64
label                   object
tweets_avg_urls        float64
tweets_avg_mentions    float64
dtype: object

In [241]:
def make_features(df, feature_names=('tweets_avg_urls', 'tweets_avg_mentions', 'listed_count')):
    """
    Turn the selected columns of `df` into a feature matrix.

    - df: data frame with one row per instance; it must contain every
      column listed in `feature_names`.
    - feature_names: columns to use as features. The default keeps the three
      features this lab started with; pass a longer list to try new ones.

    Returns `(X, vec)`, where `X` is the matrix produced by
    `DictVectorizer.fit_transform` (one row per row of `df`) and `vec` is the
    fitted `DictVectorizer`, which maps feature names to column indices.
    """
    # one {feature name: value} dict per row, in row order
    feature_dicts = df[list(feature_names)].to_dict('records')
    vec = DictVectorizer()
    X = vec.fit_transform(feature_dicts)
    return X, vec

# build the feature matrix X and keep the fitted vectorizer for name lookups
X, vec = make_features(df)


In [218]:
# what are the dimensions of the feature matrix? (rows = instances, columns = features)
X.shape


(200, 3)

In [242]:
# what are the feature names?
# vocabulary_ is a dict from feature name to column index
# (with these features the indices follow alphabetical feature-name order)
vec.vocabulary_

{'tweets_avg_urls': 2, 'tweets_avg_mentions': 1, 'listed_count': 0}

In [243]:
# what is the total of each feature, summed over all rows?
# (%d truncates the float sums to whole numbers for display)
for word, idx in vec.vocabulary_.items():
    print('%20s\t%d' % (word, X[:,idx].sum()))

     tweets_avg_urls	9149
 tweets_avg_mentions	7028
        listed_count	147695


In [244]:
# can also get a simple list of feature names:
vec.get_feature_names()

# e.g., first column is 'listed_count', second is 'tweets_avg_mentions', etc.

['listed_count', 'tweets_avg_mentions', 'tweets_avg_urls']

In [245]:
# we'll first store the classes separately in a numpy array
y = np.array(df.label)
# count how many rows carry each label
Counter(y)

Counter({'bot': 100, 'human': 100})

In [246]:
# to find the row indices with the 'bot' label
np.where(y=='bot')[0]
# np.where(y=='human')[0]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [247]:
# store the distinct class names (a set, so iteration order is not guaranteed)
class_names = set(df.label)

In [248]:
# what is the total of each feature within each class?
# (%d truncates the float sums to whole numbers for display)
for word, idx in vec.vocabulary_.items():
    for class_name in class_names:
        class_idx = np.where(y==class_name)[0]
        print('%20s\t%20s\t%d' % (word, class_name, X[class_idx, idx].sum()))

     tweets_avg_urls	                 bot	3657
     tweets_avg_urls	               human	5491
 tweets_avg_mentions	                 bot	905
 tweets_avg_mentions	               human	6123
        listed_count	                 bot	2351
        listed_count	               human	145344


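So, all three feature totals are higher for the `human` class than for the `bot` class; the gap is largest for `listed_count` (145344 vs. 2351) and `tweets_avg_mentions` (6123 vs. 905), and smallest for `tweets_avg_urls` (5491 vs. 3657).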

In [249]:
# fit a LogisticRegression classifier on all rows
# (nothing is held out here; accuracy is estimated with cross-validation below).
clf = LogisticRegression(solver='lbfgs', multi_class='auto')
clf.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [250]:
# for binary classification, LogisticRegression stores a single coefficient vector
clf.coef_
# this would be a matrix for a multi-class problem.

array([[0.01363247, 0.06014975, 0.0070873 ]])

In [251]:
# clf.coef_[0] holds the coefficients for the second entry of clf.classes_ ('human' here).
# for binary classification, the coefficients for the other class are just their negation,
# so coef is ordered like clf.classes_: [coefficients for 'bot', coefficients for 'human'].
coef = [-clf.coef_[0], clf.coef_[0]]
print(coef)

[array([-0.01363247, -0.06014975, -0.0070873 ]), array([0.01363247, 0.06014975, 0.0070873 ])]


In [252]:
# show one single-row table of coefficients per class, with feature names as column headers
for ci, class_name in enumerate(clf.classes_):
    print('coefficients for %s' % class_name)
    display(pd.DataFrame([coef[ci]], columns=vec.get_feature_names()))

coefficients for bot


Unnamed: 0,listed_count,tweets_avg_mentions,tweets_avg_urls
0,-0.013632,-0.06015,-0.007087


coefficients for human


Unnamed: 0,listed_count,tweets_avg_mentions,tweets_avg_urls
0,0.013632,0.06015,0.007087


In [253]:
# list each class's features from largest to smallest coefficient.
features = vec.get_feature_names()
for ci, class_name in enumerate(clf.classes_):
    print('top features for class %5s' % class_name)
    for fi in coef[ci].argsort()[::-1]: # argsort is ascending; [::-1] flips it to descending order.
        print('%20s\t%.4f' % (features[fi], coef[ci][fi]))

top features for class   bot
     tweets_avg_urls	-0.0071
        listed_count	-0.0136
 tweets_avg_mentions	-0.0601
top features for class human
 tweets_avg_mentions	0.0601
        listed_count	0.0136
     tweets_avg_urls	0.0071


In [254]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# 5-fold cross-validation: shuffle once with a fixed seed, then train on four
# folds and score accuracy on the held-out fold, for every fold in turn.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []
for train, test in kf.split(X):
    # fit() returns the classifier itself, so the prediction can be chained;
    # note that clf is re-fitted in place and ends up trained on the last split
    pred = clf.fit(X[train], y[train]).predict(X[test])
    accuracies += [accuracy_score(y[test], pred)]

print(f'accuracy over all cross-validation folds: {accuracies}')
print(f'mean={np.mean(accuracies):.2f} std={np.std(accuracies):.2f}')

accuracy over all cross-validation folds: [0.975, 0.875, 0.85, 1.0, 0.85]
mean=0.91 std=0.06
