In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import re
from sklearn.metrics import confusion_matrix, f1_score, precision_score,\
recall_score, confusion_matrix, classification_report, accuracy_score 
import nltk
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import pyltr

In [3]:
def __GetLIWC(file:str): 
	liwc = pd.read_csv(file)
	liwc = liwc.rename(columns = {liwc.columns[2]:'user_id'})
	liwcUser = liwc.groupby('user_id').mean().reset_index()
	liwcUser = liwcUser.drop(['Source (A)', 'Source (D)'], axis=1)
	return liwcUser

def mergeFea(features, liwc, empath): 
	features = pd.read_csv(features)

	#merge features
	liwcUser = __GetLIWC(liwc)
	liwcUser2 = liwcUser.iloc[:,1::]
	liwcUser2.columns = [str(col) + '_liwc' for col in liwcUser2.columns]
	liwcUser2['user_id'] = liwcUser.user_id

	empath = pd.read_csv(empath)
	empath2 = empath.iloc[:,1::]
	empath2.columns = [str(col) + '_empath' for col in empath2.columns]
	empath2['user_id'] = empath.user_id

	allfea = pd.merge(features, liwcUser2, on = 'user_id', how = 'right')
	allfea = pd.merge(allfea, empath2, on = 'user_id', how = 'right')
	return allfea

def preprocess2(sent):
    #remove punctustion
    sent = re.sub(r'[^\w\s]','',str(sent))
    words = sent.split()
    new_words = []
    for w in words:      
        new_words.append(w.lower())
        
    return ' '.join(new_words)


class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key].values.reshape(-1,1)
    
class ItemSelectorText(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

In [4]:
path = '/home/lucia/phd_work/shareTask/'
#path = '/Users/lucia/phd_work/Clpsy/'
X = pd.read_csv(path + '/data/clpsych19_training_data/Btrain_NoNoise_SW.csv')
y = pd.read_csv(path + '/data/clpsych19_training_data/crowd_train.csv')

In [5]:
#concatenate text according to user id
X1 = X[['user_id','post_body']]
conTex = X1.groupby(['user_id'],as_index=False).agg(lambda x : x.sum() if str(x) else ' '.join(x))


In [6]:
conTex['post_body'] = conTex['post_body'].apply(lambda x: preprocess2(x))

In [7]:
Fea = pd.read_csv(path + '/suicideDetection/features/FreqSentiMotiTopiFea.csv')
Fea.columns

Index(['Unnamed: 0', 'user_id', 'raw_label', 'postingFrequency',
       'postingInterval', 'generalMoreFreq', 'generalWordCount',
       'healthPostingFrequency', 'healthPostingInterval', 'healthMoreFreq',
       'healthWordCount', 'mentionMethods', 'SWFrequency', 'SWPostingInterval',
       'SWFreq', 'SWWordCount', 'fin_body', 'drug_body', 'mental_body',
       'rela_body', 'suicide_body', 'hopeless_body', 'motivations',
       'family_senti', 'partner_senti', 'self_senti', 'raw_label.1',
       'sentiment', 'mclust'],
      dtype='object')

In [64]:
allFea = pd.read_csv(path + '/suicideDetection/features/FreqSentiMotiTopiFea.csv')
liwc = pd.read_csv(path + '/suicideDetection/features/liwcSW.csv')
tags = pd.read_csv(path + '/suicideDetection/features/TagFeaSW.csv')
empath = pd.read_csv(path + '/suicideDetection/features/empathSW.csv')
readability = pd.read_csv(path + '/suicideDetection/features/readability.csv')
embeddings = pd.read_csv(path + '/suicideDetection/features/Embeddings.csv')

liwc = liwc.iloc[:,np.r_[2, 8:liwc.shape[1]]]
liwc = liwc.rename(columns = {liwc.columns[0]:'user_id'})
liwcUser = liwc.groupby('user_id').mean().reset_index()
liwcUser.columns = [str(col) + '_liwc' for col in liwcUser.columns]
readability = readability.drop(['post_body'], axis = 1)

empath.columns = [str(col) + '_empath' for col in empath.columns]
tags.columns = [str(col) + '_tag' for col in tags.columns]
readability.columns = [str(col) + '_read' for col in readability.columns]
allFea = pd.merge(allFea, liwcUser, left_on ='user_id', right_on = 'user_id_liwc', how = 'left')
allFea = pd.merge(allFea, empath, left_on ='user_id', right_on = 'user_id_empath', how = 'left')
allFea = pd.merge(allFea, tags, left_on ='user_id', right_on = 'user_id_tag', how = 'left')
allFea = pd.merge(allFea, readability, left_on ='user_id', right_on = 'user_id_read', how = 'left')
allFea = pd.merge(allFea, embeddings, left_on ='user_id', right_on = 'user_id', how = 'left')

# # # #liwc
# # # allFea


In [65]:
#fea = pd.merge(y, conTex, on = 'user_id')
allFea2  = pd.merge(conTex, allFea, on = 'user_id')
#X = fea['post_body']
y = allFea2.raw_label

In [66]:
X_train, X_test, y_train, y_test = train_test_split(allFea2, y, test_size=0.30, random_state=35)

In [67]:
print(X_train.shape)
print(y_train.shape)

(347, 185)
(347,)


In [68]:
X_train.columns

Index(['user_id', 'post_body', 'Unnamed: 0_x', 'raw_label', 'postingFrequency',
       'postingInterval', 'generalMoreFreq', 'generalWordCount',
       'healthPostingFrequency', 'healthPostingInterval',
       ...
       'Unnamed: 0_read', 'user_id_read', 'readEase_read',
       'Flesch-Kincaid_read', 'gunning_fog_read', 'smog_index_read',
       'coleman_liau_read', 'linsear_write_read', 'Unnamed: 0_y', 'embedding'],
      dtype='object', length=185)

In [175]:
get_liwc_data = FunctionTransformer(lambda x: x[x.columns[x.columns.to_series().str.contains('liwc')]], validate=False)
get_empath_data = FunctionTransformer(lambda x: x[x.columns[x.columns.to_series().str.contains('empath')]], validate=False)
get_tags = FunctionTransformer(lambda x: x[x.columns[x.columns.to_series().str.contains('tag')]], validate=False)
get_read = FunctionTransformer(lambda x: x[x.columns[x.columns.to_series().str.contains('read')]], validate=False)


pipe1 = Pipeline([
    
    ('feats', FeatureUnion([
        ('text', Pipeline([
            ('selector', ItemSelectorText(key='post_body')),
            ('cv', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
             ])),
#
#         ('readability_features', Pipeline([
#                 ('selector', get_read)])),
        ('liwc_features', get_liwc_data),
        ('empath_features', get_empath_data),
        ('tag_features', get_tags),
        ('selector1', ItemSelector(key='motivations')),
#         ('selector2', ItemSelector(key='healthPostingFrequency')),
#         ('selector3', ItemSelector(key='healthPostingInterval')),
#         ('selector4', ItemSelector(key='healthMoreFreq')),        
        ('selector5', ItemSelector(key='healthWordCount')),
        ('selector6', ItemSelector(key='mentionMethods')),
        
#         ('selector7', ItemSelector(key='SWFrequency')),
#         ('selector8', ItemSelector(key='SWPostingInterval')),
#         ('selector9', ItemSelector(key='SWFreq')),
#         ('selector10', ItemSelector(key='suicide_body')),
#         ('selector11', ItemSelector(key='hopeless_body')),
#         ('selector12', ItemSelector(key='self_senti')),
#         ('selector13', ItemSelector(key='mclust')),
#         ('selector14', ItemSelector(key='family_senti')),
#       ('selector15', ItemSelector(key='embedding')),
             ])),
    
       ('clf', Pipeline([
       ('scale', StandardScaler(with_mean=False)),
       ('feature_selection', SelectFromModel(RandomForestClassifier(random_state=20))),
#        ('svm',  svm.SVC())
       ('log', LogisticRegression()),
#      ('gbn', GaussianNB()),

         ])),
])

# parameters = [{
#             'estimator__clf__svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'estimator__clf__svc__gamma': [0.01, 0.001, 0.0001],
#             'estimator__clf__svc__C':[0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.5, 2.0, 10] , 'estimator__clf__svc__class_weight':['balanced']}]
# parameters = [{'estimator__clf__feature_selection__estimator__max_depth': [5,10,20], 'estimator__clf__feature_selection__estimator__max_leaf_nodes': [50, 100, 200],
#                'estimator__clf__log__C':[1.0, 2.0, 3.0], 'estimator__clf__log__class_weight': ['balanced'], 'estimator__clf__log__multi_class': ['ovr', 'multinomial']}]

parameters = [{'estimator__clf__log__C':[1.0, 2.0, 3.0]}]

grid_search_item = GridSearchCV(pipe1, parameters, cv = 5, scoring='accuracy')

grid_search = grid_search_item.fit(X_train, y_train)
# pred = pipe1.predict(X_test)
# print(classification_report(y_test, pred))

ValueError: Invalid parameter estimator for estimator Pipeline(memory=None,
     steps=[('feats', FeatureUnion(n_jobs=None,
       transformer_list=[('text', Pipeline(memory=None,
     steps=[('selector', ItemSelectorText(key='post_body')), ('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='c...alty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]))]). Check the list of available parameters with `estimator.get_params().keys()`.

In [172]:
for para in grid_search_item.get_params().keys():
    print(para)

cv
error_score
estimator__memory
estimator__steps
estimator__feats
estimator__clf
estimator__feats__n_jobs
estimator__feats__transformer_list
estimator__feats__transformer_weights
estimator__feats__text
estimator__feats__liwc_features
estimator__feats__empath_features
estimator__feats__tag_features
estimator__feats__selector1
estimator__feats__selector5
estimator__feats__selector6
estimator__feats__text__memory
estimator__feats__text__steps
estimator__feats__text__selector
estimator__feats__text__cv
estimator__feats__text__tfidf
estimator__feats__text__selector__key
estimator__feats__text__cv__analyzer
estimator__feats__text__cv__binary
estimator__feats__text__cv__decode_error
estimator__feats__text__cv__dtype
estimator__feats__text__cv__encoding
estimator__feats__text__cv__input
estimator__feats__text__cv__lowercase
estimator__feats__text__cv__max_df
estimator__feats__text__cv__max_features
estimator__feats__text__cv__min_df
estimator__feats__text__cv__ngram_range
estimator__feats__te

learning to rank

In [72]:
from sklearn.linear_model import LinearRegression
from math import sin
import numpy as np
import csv

In [None]:
butIRegress = LinearRegression()

In [99]:
pipe2 = Pipeline([
    
    ('feats', FeatureUnion([
        ('text', Pipeline([
            ('selector', ItemSelectorText(key='post_body')),
            ('cv', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
             ])),
        ('liwc_features', Pipeline([
                ('selector', get_liwc_data) ])),
        ('empath_features', Pipeline([
                ('selector', get_empath_data)])),
        ('tag_features', Pipeline([
                ('selector', get_tags)])),
#         ('readability_features', Pipeline([
#                 ('selector', get_read)])),
        ('selector1', ItemSelector(key='motivations')),

        ('selector5', ItemSelector(key='healthWordCount')),
        ('selector6', ItemSelector(key='mentionMethods')),
             ])),
    
       ('clf', Pipeline([
       ('scale', StandardScaler(with_mean=False)),
       ('feature_selection', SelectFromModel(RandomForestClassifier(random_state=20))),

      ('log', LogisticRegression()),
#      ('gbn', GaussianNB()),

         ])),
])
parameters = [{'feature_selection__estimator__max_depth': [5,10,20], 'feature_selection__estimator__max_leaf_nodes': [50, 100, 200]}]

    
estimator = GridSearchCV(pipe2, parameters, scoring='roc_auc')
estimator.fit(X_train, y_train)

# pred = pipe1.predict(X_test)
# print(classification_report(y_test, pred))



Pipeline(memory=None,
     steps=[('feats', FeatureUnion(n_jobs=None,
       transformer_list=[('text', Pipeline(memory=None,
     steps=[('selector', ItemSelectorText(key='post_body')), ('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='c...alty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]))])

In [116]:
pipe2.best_estimator_.named_steps['clf']

AttributeError: 'Pipeline' object has no attribute 'best_estimator_'

In [None]:
÷