In [1]:
# Import libraries.
import numpy as np
import pandas as pd
import string

from sklearn.feature_selection import f_regression, RFE, SelectKBest
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, mean_absolute_error, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load data.
df = pd.read_csv('first_week_oct_2015_comments_by_top_400_with_scores_and_features_v2.csv')

In [3]:
# Preview data.
df.head()

Unnamed: 0,id,parent,by,time,hour_posted,text,dead,ranking,text_len,pct_caps,...,newlines_per_char,papi_toxicity,v_neg,v_neu,v_pos,v_compound,tb_polarity,tb_subjectivity,tb_nb_prob_neg,pc_prob_offensive
0,10331981,10331895,debacle,2015-10-05 14:24:42+00:00,14,US is not really scared by BRICS at all. They'...,False,0,146,0.068493,...,0,0.100881,0.0,0.744,0.256,0.7859,0.15625,0.61875,0.080001,0.238871
1,10343811,10343761,sarciszewski,2015-10-07 02:13:15+00:00,2,"I wasn't really trying to argue, they said the...",False,0,76,0.013158,...,0,0.048637,0.195,0.805,0.0,-0.3947,0.2,0.2,0.593498,0.050161
2,10331538,10331008,debacle,2015-10-05 13:08:10+00:00,13,The examples on the homepage kind of underscor...,False,12,88,0.034091,...,0,0.044777,0.0,0.864,0.136,0.2975,0.1375,0.5,0.336201,0.098511
3,10340097,10339965,debacle,2015-10-06 16:33:06+00:00,16,No mention of a critical aspect of a service l...,False,22,99,0.010101,...,0,0.035335,0.214,0.667,0.119,-0.25,0.0,0.8,0.352414,0.056323
4,10338552,10337763,debacle,2015-10-06 13:06:26+00:00,13,I think some of these points are gross exagger...,False,38,868,0.013825,...,0,0.232577,0.081,0.902,0.017,-0.8233,0.076667,0.26,0.00079,0.001499


In [4]:
def avg_word_len(text):
    words = text.split()
    return sum(len(word) for word in 
               filter(lambda word: word[:4] != 'http', words)) / len(words)

In [5]:
df['avg_word_len'] = df['text'].apply(avg_word_len)

In [6]:
df['avg_word_len'].describe()

count    9970.000000
mean        4.736865
std         0.729452
min         0.000000
25%         4.410371
50%         4.714286
75%         5.040000
max        24.000000
Name: avg_word_len, dtype: float64

In [7]:
# Examine comments with shortest average word lengths.
for text in df.sort_values(by='avg_word_len')['text'].head(10):
    print(text, '\n')

https://news.ycombinator.com/item?id=10305855 

http://voxinc.typepad.com/thecustomer/files/cheat_sheet.htm 

https://supporters.eff.org/donate 

https://en.wikipedia.org/wiki/Plaza_Accord 

https://www.youtube.com/watch?v=E3s-qZsjK8I 

http://ridetheclown.com/wp/saver2/ 

https://news.ycombinator.com/item?id=10207454 

https://news.ycombinator.com/item?id=10307163 

https://news.ycombinator.com/item?id=10295658 

https://www.youtube.com/watch?v=5rQLLpgxjvs 



Excluding links, these comments have no words at all.

In [8]:
# Examine comments with longest average word lengths.
for text in df.sort_values(by='avg_word_len', ascending=False)['text'].head(10):
    print(text, '\n')

www.cantheyseemydick.com 

pigeon@tutanota.com 

When i log in and view source I get back a little bit of json only {"_id":"560ee464ba6b14a61c74de66","name":"Untitled","data":null,"ownerId":"560ee461ba6b14a61c74de65","javascript":"","__v":0,"opened":"2015-10-02T20:09:09.119Z","modified":"2015-10-02T20:09:08.569Z","created":"2015-10-02T20:09:08.569Z","externalLibraries":[]} how are you doing that ? 

Browser/version/OS? Thanks! 

That's a lot of code. curl -s "https://en.wikipedia.org/w/api.php?action=parse&contentmodel=wikitext&format=json&redirects&explaintext&prop=text§ion=0&page=golang" | jq .parse.text.\"*\" > temp.html; links temp.html (Scroll that). Not quite the same, but... 

How... unreadable. Congratulations Twitter? 

Examples, please. Particularly software innovations. 

Does anyone know how they did their chemistry representation? The entry at http://reference.wolfram.com/language/ref/ChemicalData.html for Aspartame has a charge separated SMILES of COC(=O)C(CC1=CC=CC=C1)NC

In [9]:
df.head()

Unnamed: 0,id,parent,by,time,hour_posted,text,dead,ranking,text_len,pct_caps,...,papi_toxicity,v_neg,v_neu,v_pos,v_compound,tb_polarity,tb_subjectivity,tb_nb_prob_neg,pc_prob_offensive,avg_word_len
0,10331981,10331895,debacle,2015-10-05 14:24:42+00:00,14,US is not really scared by BRICS at all. They'...,False,0,146,0.068493,...,0.100881,0.0,0.744,0.256,0.7859,0.15625,0.61875,0.080001,0.238871,4.653846
1,10343811,10343761,sarciszewski,2015-10-07 02:13:15+00:00,2,"I wasn't really trying to argue, they said the...",False,0,76,0.013158,...,0.048637,0.195,0.805,0.0,-0.3947,0.2,0.2,0.593498,0.050161,4.923077
2,10331538,10331008,debacle,2015-10-05 13:08:10+00:00,13,The examples on the homepage kind of underscor...,False,12,88,0.034091,...,0.044777,0.0,0.864,0.136,0.2975,0.1375,0.5,0.336201,0.098511,4.5625
3,10340097,10339965,debacle,2015-10-06 16:33:06+00:00,16,No mention of a critical aspect of a service l...,False,22,99,0.010101,...,0.035335,0.214,0.667,0.119,-0.25,0.0,0.8,0.352414,0.056323,4.263158
4,10338552,10337763,debacle,2015-10-06 13:06:26+00:00,13,I think some of these points are gross exagger...,False,38,868,0.013825,...,0.232577,0.081,0.902,0.017,-0.8233,0.076667,0.26,0.00079,0.001499,5.297101


In [10]:
candidates = ['hour_posted', 'text_len', 'avg_word_len', 'pct_caps', 
              'tags_per_char', 
              'v_neg', 'v_neu', 'v_pos', 'v_compound', 
              'tb_polarity', 'tb_subjectivity', 'tb_nb_prob_neg', 
              'pc_prob_offensive']

In [11]:
X = df[candidates]
y = df['papi_toxicity']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)

In [13]:
scaler = MinMaxScaler()

In [14]:
%%capture
X_train[candidates] = scaler.fit_transform(X_train)

In [15]:
X_train.describe()

Unnamed: 0,hour_posted,text_len,avg_word_len,pct_caps,tags_per_char,v_neg,v_neu,v_pos,v_compound,tb_polarity,tb_subjectivity,tb_nb_prob_neg,pc_prob_offensive
count,7976.0,7976.0,7976.0,7976.0,7976.0,7976.0,7976.0,7976.0,7976.0,7976.0,7976.0,7976.0,7976.0
mean,0.593362,0.044762,0.249299,0.027973,0.027404,0.083194,0.827302,0.104272,0.571644,0.546508,0.450752,0.278648,0.068112
std,0.286732,0.053882,0.036856,0.027859,0.057738,0.100145,0.124401,0.106107,0.277975,0.103709,0.233298,0.292911,0.106917
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.391304,0.013973,0.232456,0.013885,0.0,0.0,0.763,0.023,0.363163,0.5,0.339926,0.014155,0.010125
50%,0.652174,0.028059,0.24812,0.021277,0.0,0.057247,0.838,0.088,0.576527,0.54,0.475,0.16887,0.039272
75%,0.826087,0.055095,0.26555,0.033613,0.03986,0.127893,0.909,0.147,0.815716,0.6,0.585714,0.492991,0.084964
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
%%capture
X_test[candidates] = scaler.transform(X_test)

## F-scores

In [17]:
best = SelectKBest(score_func=f_regression, k='all')

In [18]:
fit = best.fit(X_train, y_train)

In [19]:
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_train.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Feature', 'F-Score']  #naming the dataframe columns
featureScores.sort_values('F-Score', ascending=False)

Unnamed: 0,Feature,F-Score
12,pc_prob_offensive,2923.899184
5,v_neg,1126.648971
8,v_compound,606.650239
9,tb_polarity,249.135958
6,v_neu,224.771518
1,text_len,163.890759
10,tb_subjectivity,107.558141
7,v_pos,48.318749
3,pct_caps,23.393416
11,tb_nb_prob_neg,15.378748


In [20]:
y_train.shape

(7976,)

## Recursive Feature Elimination

In [21]:
model = LinearRegression()

In [22]:
rfe = RFE(model, 5)
fit = rfe.fit(X_train, y_train)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))
X_train.columns[fit.support_]

Num Features: 5
Selected Features: [False  True False  True  True  True False False False False False False
  True]
Feature Ranking: [6 1 7 1 1 1 8 5 4 2 3 9 1]


Index(['text_len', 'pct_caps', 'tags_per_char', 'v_neg', 'pc_prob_offensive'], dtype='object')

In [23]:
rfe = RFE(model, 4)
fit = rfe.fit(X_train, y_train)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))
X_train.columns[fit.support_]

Num Features: 4
Selected Features: [False  True False  True False  True False False False False False False
  True]
Feature Ranking: [ 7  1  8  1  2  1  9  6  5  3  4 10  1]


Index(['text_len', 'pct_caps', 'v_neg', 'pc_prob_offensive'], dtype='object')

In [24]:
rfe = RFE(model, 3)
fit = rfe.fit(X_train, y_train)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))
X_train.columns[fit.support_]

Num Features: 3
Selected Features: [False  True False False False  True False False False False False False
  True]
Feature Ranking: [ 8  1  9  2  3  1 10  7  6  4  5 11  1]


Index(['text_len', 'v_neg', 'pc_prob_offensive'], dtype='object')

In [25]:
features = ['text_len', 'v_neg', 'pc_prob_offensive']

## Linear Regression

In [26]:
model.fit(X_train[features], y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [27]:
pred_train = model.predict(X_train[features])

In [28]:
pd.DataFrame(pred_train).describe()

Unnamed: 0,0
count,7976.0
mean,0.150089
std,0.085147
min,0.062677
25%,0.102857
50%,0.127615
75%,0.16668
max,0.918178


In [29]:
pred_test = model.predict(X_test[features])

In [30]:
print('Train MAE: ', mean_absolute_error(pred_train, y_train))
print('Test MAE: ', mean_absolute_error(pred_test, y_test))

Train MAE:  0.07648005868977967
Test MAE:  0.07573983132555266


In [31]:
# Is that a lot of error or a little, given the scale and
# variance of our target variable?
y_train.describe()

count    7976.000000
mean        0.150089
std         0.138293
min         0.004981
25%         0.066070
50%         0.102494
75%         0.188944
max         0.976457
Name: papi_toxicity, dtype: float64

In [32]:
# What about compared to a baseline baseline model, just predicting 
# the mean toxicity?
mean_pred_train = np.full_like(y_train, y_train.mean())
mean_pred_test = np.full_like(y_test, y_train.mean())

In [33]:
print('Train MAE (predicting mean): ', mean_absolute_error(mean_pred_train, y_train))
print('Test MAE (predicting mean): ', mean_absolute_error(mean_pred_test, y_test))

Train MAE (predicting mean):  0.09738346823118572
Test MAE (predicting mean):  0.09508224118804677


## Logistic Regression

In [34]:
# Convert PerspectiveAPI toxicity scores to binary class labels.
y_train_binary = (y_train > 0.7).astype(int)
y_test_binary = (y_test > 0.7).astype(int)

In [35]:
# Examine class balance in training dataset.
y_train_binary.value_counts(normalize=True)

0    0.989594
1    0.010406
Name: papi_toxicity, dtype: float64

In [36]:
model = LogisticRegression(class_weight='balanced')

In [37]:
model.fit(X_train[features], y_train_binary)



LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [38]:
pred_train = model.predict(X_train[features])
pred_test = model.predict(X_test[features])

In [39]:
np.bincount(pred_train)

array([7493,  483], dtype=int64)

In [40]:
# Calculate metrics on training dataset.
train_accuracy = accuracy_score(y_train_binary, pred_train)
train_recall = recall_score(y_train_binary, pred_train)
train_precision = precision_score(y_train_binary, pred_train)

# Calculate metrics on test dataset.
test_accuracy = accuracy_score(y_test_binary, pred_test)
test_recall = recall_score(y_test_binary, pred_test)
test_precision = precision_score(y_test_binary, pred_test)

In [41]:
# Display metrics for training dataset.
print(f'Train accuracy: {train_accuracy:.4f}')
print(f'Train recall: {train_recall:.4f}')
print(f'Train precision: {train_precision:.4f}', '\n')

# Display metrics for test dataset.
print(f'Test accuracy: {test_accuracy:.4f}')
print(f'Test recall: {test_recall:.4f}')
print(f'Test precision: {test_precision:.4f}')

Train accuracy: 0.9468
Train recall: 0.8554
Train precision: 0.1470 

Test accuracy: 0.9498
Test recall: 0.7000
Test precision: 0.1296
