In [145]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [146]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,ID,Place,location,date,status,job_title,summary,positives,negatives,advice_to_mgmt,score_1,score_2,score_3,score_4,score_5,score_6,overall
0,1,startup_1,,"Dec 11, 2018",Current Employee,Anonymous Employee,Best Company to work for,People are smart and friendly,Bureaucracy is slowing things down,,4.0,5.0,5.0,4.0,5.0,0,5.0
1,2,startup_1,"Mountain View, CA","Jun 21, 2013",Former Employee,Program Manager,"Moving at the speed of light, burn out is inev...","1) Food, food, food. 15+ cafes on main campus ...",1) Work/life balance. What balance? All those ...,1) Don't dismiss emotional intelligence and ad...,2.0,3.0,3.0,5.0,3.0,2094,5.0
2,3,startup_1,"New York, NY","May 10, 2014",Current Employee,Software Engineer III,Great balance between big-company security and...,"* If you're a software engineer, you're among ...","* It *is* becoming larger, and with it comes g...",Keep the focus on the user. Everything else wi...,5.0,4.0,5.0,5.0,4.0,949,5.0
3,4,startup_1,"Mountain View, CA","Feb 8, 2015",Current Employee,Anonymous Employee,The best place I've worked and also the most d...,You can't find a more well-regarded company th...,I live in SF so the commute can take between 1...,Keep on NOT micromanaging - that is a huge ben...,2.0,5.0,5.0,4.0,5.0,498,4.0
4,10,startup_1,,"Dec 9, 2018",Current Employee,Anonymous Employee,Execellent for engineers,Impact driven. Best tech in the world.,Size matters. Engineers are a bit disconnected...,,5.0,5.0,5.0,5.0,5.0,0,4.0


In [149]:
score_cols =['score_1', 'score_2', 'score_3', 'score_4', 'score_5']
df['avg_score'] = df[score_cols].apply(np.mean, axis=1)

In [150]:
cols = ['positives', 'negatives', 'avg_score', 'overall']
X = df[cols]

In [151]:
numeric_cols = ['avg_score']
label = ['overall']

In [152]:
def combine_text_cols(df, to_drop=numeric_cols+label):
    to_drop = set(to_drop) & set(df.columns.tolist())
    text_data = df.drop(to_drop, axis=1)
    text_data.fillna('', inplace=True)
    return text_data.apply(lambda x:" ".join(x), axis=1)

In [153]:
df['reviews'] = combine_text_cols(X)

In [154]:
cols_data = ['reviews', 'avg_score', 'overall']
data = df[cols_data]

In [155]:
from nltk.corpus import stopwords
import re

In [156]:
def split_text(text):
    text = re.sub("[+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+", "", text)
    text = text.lower()
    return text.split()


In [157]:
def get_sentence_list(data, column):
    sentence_list = []
    stop = [re.sub("[+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+", "", word) for word in stopwords.words('english')]
    for i in range(len(data)):
        text = data.iloc[i][column]
        word_list = [word for word in split_text(text) if word not in stop]
        sentence_list.append(word_list)
    return sentence_list

In [158]:
def get_key_words(data, column='reviews'):
    s_list = get_sentence_list(data, column)
    word_dict = {}
    for sentence in s_list:
        for word in sentence:
            if word not in word_dict.keys():
                word_dict[word] = 1
            else:
                word_dict[word] += 1
    result = {'words' : {}, 'frequency' : {}}
    count = 0
    for word in word_dict.keys():
        result['words'][count] = word
        result['frequency'][count] = word_dict[word]
        count += 1
    return pd.DataFrame.from_dict(result)


In [159]:
def get_pos_and_neg_words(data):
    data_pos = data[data['overall'] >= 4]
    data_neg = data[data['overall'] <= 3]
    pos_words = list(get_key_words(data_pos).nlargest(100, 'frequency')['words'])
    neg_words = list(get_key_words(data_neg).nlargest(100, 'frequency')['words'])
    commendatory = [word for word in pos_words if word not in neg_words]
    derogatory = [word for word in neg_words if word not in pos_words]
    return commendatory, derogatory

In [160]:
commendatory, derogatory = get_pos_and_neg_words(data)

In [161]:
def encode_text_by_word_frequency(data, column='reviews'):
#    commendatory, derogatory = get_pos_and_neg_words(data)
    result = {'pos_word_frequency' : {}, 'neg_word_frequency' : {}}
    for i in range(len(data)):
        text = data.iloc[i][column]
        word_list = split_text(text)
        pos_count = 0
        neg_count = 0
        for word in word_list:
            if word in commendatory:
                pos_count += 1
            if word in derogatory:
                neg_count += 1
#        rating = data.iloc[i]['overall']
        """if rating >= 4:
            result['pos_word_frequency'][i] = pos_count
            result['neg_word_frequency'][i] = -neg_count
        elif rating <= 3:
            result['pos_word_frequency'][i] = -pos_count
            result['neg_word_frequency'][i] = neg_count"""
        result['pos_word_frequency'][i] = pos_count
        result['neg_word_frequency'][i] = neg_count
        
    result_df = pd.DataFrame.from_dict(result)
    return pd.concat([data.reset_index(drop=True), result_df], axis=1)

In [162]:
data_alt = encode_text_by_word_frequency(data, column='reviews')

In [163]:
data_alt.head(n=15)

Unnamed: 0,reviews,avg_score,overall,pos_word_frequency,neg_word_frequency
0,People are smart and friendly Bureaucracy is s...,4.6,5.0,0,0
1,"1) Food, food, food. 15+ cafes on main campus ...",3.2,5.0,10,5
2,"* If you're a software engineer, you're among ...",4.6,5.0,7,3
3,You can't find a more well-regarded company th...,4.2,4.0,12,4
4,Impact driven. Best tech in the world. Size ma...,5.0,4.0,0,0
5,The people are great to work with There is low...,,5.0,0,0
6,"The people are great to work with, good perks....",4.6,3.0,0,0
7,"Good managers, benefits , some support, nice a...",2.0,5.0,0,0
8,Really fun work environment with startup Reall...,5.0,5.0,1,0
9,Awesome job environment to work in Pressure is...,,5.0,0,0


In [164]:
useful_cols = ['avg_score', 'pos_word_frequency', 'neg_word_frequency']
train = data_alt[useful_cols]

In [165]:
y = data_alt['overall']

In [166]:
from sklearn.pipeline import Pipeline
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import Imputer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

In [167]:
X_train, x_test, y_train, y_test = train_test_split(train, y, test_size = 0.33, random_state=2)

In [168]:
pl = Pipeline([('imputer', Imputer()),
              ('scale', MaxAbsScaler()),
              ('clf', OneVsRestClassifier(AdaBoostClassifier()))])



In [169]:
pl.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('scale', MaxAbsScaler(copy=True)), ('clf', OneVsRestClassifier(estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
          n_jobs=None))])

In [170]:
y_pred = pl.predict(x_test)

In [171]:
from sklearn.metrics import accuracy_score, fbeta_score, mean_squared_error
acc_y = accuracy_score(y_test, y_pred)
f1_y = fbeta_score(y_test, y_pred, beta=1, average='micro')
mse = mean_squared_error(y_test, y_pred)
print('accuracy', acc_y)
print('f1 score', f1_y)
print('MSE', mse)

accuracy 0.4092498251922885
f1 score 0.4092498251922885
MSE 0.9021076815502946


In [172]:
df_t = pd.read_csv('test.csv')
df_t.head()

Unnamed: 0,ID,Place,location,date,status,job_title,summary,positives,negatives,advice_to_mgmt,score_1,score_2,score_3,score_4,score_5,score_6
0,5,startup_1,"Los Angeles, CA","Jul 19, 2018",Former Employee,Software Engineer,"Unique, one of a kind dream job",Google is a world of its own. At every other c...,"If you don't work in MTV (HQ), you will be giv...",Promote managers into management for their man...,5,5,5,5,5,49
1,6,startup_1,"Mountain View, CA","Dec 9, 2018",Former Employee,SDE2,NICE working in GOOGLE as an INTERN,"People are not that busy, so they are nice to ...",Food is not good as I expected. People said it...,,4,4,4,5,4,1
2,7,startup_1,"New York, NY","Dec 11, 2018",Current Employee,Software Engineer,Software engineer,Great working environment. Good work life balance,Usual big company problems. Hierarchy.,,5,4,4,5,4,0
3,8,startup_1,,"Dec 11, 2018",Former Employee,Anonymous Employee,great place to work and progress,"work culture, benefits, growth, people,",No cons that i can think of,,5,5,5,5,5,0
4,9,startup_1,"New York, NY","Dec 10, 2018",Current Employee,Anonymous Employee,Google Surpasses Realistic Expectations,Great products. Vision you can feel good about...,Younger employees complaining about the compan...,,5,5,5,5,5,0


In [173]:
df_t['avg_score'] = df_t[score_cols].apply(np.mean, axis=1)

In [174]:
cols_t = ['positives', 'negatives', 'avg_score']
X_t = df_t[cols_t]

In [175]:
df_t['reviews'] = combine_text_cols(X_t)

In [176]:
cols_data_t = ['reviews', 'avg_score']
data_t = df_t[cols_data_t]

In [177]:
data_alt_t = encode_text_by_word_frequency(data_t, column='reviews')

In [178]:
test = data_alt_t[useful_cols]

In [179]:
test_y = pl.predict(test)

In [180]:
test_y

array([5., 4., 4., ..., 3., 3., 4.])

In [181]:
ID = df_t['ID']

In [182]:
dict_t = {'ID':ID,
         'overall':test_y}
df2 = pd.DataFrame(dict_t, columns=['ID', 'overall'])

In [183]:
df2.to_csv('last_chk.csv', index=False)