In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

### Create vectors

In [3]:
df = pd.read_json('../data/splitData/postdataLinesSplit.json',lines=True)

keywordsJson = pd.read_json('../data/depression_synonyms.json', orient='records')
keywords = list(keywordsJson['depression'])

posts = []

for row in df['text']:
    post = row[0].lower()
    posts.append(post)

vectorizer = TfidfVectorizer(stop_words=keywords,lowercase=True)
vectors = vectorizer.fit_transform(posts)
vectors = vectors.todense()
vector_dicts = []
for vector in vectors:
    vector_dict = {i: value for i, value in enumerate(vector.tolist()[0])}
    vector_dicts.append(vector_dict)

vector_df = pd.DataFrame(vector_dicts)
df = pd.concat([df, vector_df], axis=1)
df.to_json("../data/vectorData/TFIDFEXVectors.json", orient='records', lines=True)



### Train model

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import numpy as np


In [8]:
df = pd.read_json('../data/vectorData/TFIDFEXVectors.json', orient='records', lines=True)

(df['gender'] == 'm') & (df['split'] == 'train')

X_train = df.loc[(df['split'] == 'train'), [str(i) for i in range(21621)]]
X_test = df.loc[(df['split'] == 'test'), [str(i) for i in range(21621)]]
# X_testm = df.loc[(df['gender'] == 'm') & (df['split'] == 'test'), [str(i) for i in range(712)]]
# X_testf = df.loc[(df['gender'] == 'f') & (df['split'] == 'test'), [str(i) for i in range(712)]]

y_train = df.loc[(df['split'] == 'train'), ['label']]
y_test = df.loc[(df['split'] == 'test'), ['label']]
# y_testm = df.loc[(df['gender'] == 'm') & (df['split'] == 'test'), ['label']]
# y_testf = df.loc[(df['gender'] == 'f') & (df['split'] == 'test'), ['label']]

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
# X_testm = scaler.transform(X_testm)
# X_testf = scaler.transform(X_testf)

y_train = y_train.values.ravel()

param_grid = {'C': np.logspace(-3, 3, 7),  
              'gamma': np.logspace(-3, 3, 7), 
              'kernel': ['rbf']}  

cv = 5
scoring = 'accuracy'

grid = GridSearchCV(SVC(random_state=99, probability=True, class_weight='balanced'), param_grid, scoring=scoring, cv=cv, n_jobs=-1, verbose=1)
grid.fit(X_train,y_train)

Fitting 5 folds for each of 49 candidates, totalling 245 fits


### Get predictions

In [41]:
y_pred = grid.predict(X_test)

testSet = df.loc[(df['split'] == 'test')]

testSet['prediction'] = y_pred

testSet.to_json('../data/predictionData/TFIDFEXPred.json',orient='records',lines=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testSet['prediction'] = y_pred
