In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.feature_selection import SelectKBest, chi2


### Create vectors and top 500 word ranking

In [38]:
df = pd.read_json('../data/splitData/postdataLinesSplit.json',lines=True)
dfm = df.loc[(df['gender'] == 'm')]
dff = df.loc[(df['gender'] == 'f')]
labels = df['label']
labelsm = dfm['label']
labelsf = dff['label']

posts = []
postsm = []
postsf = []

for row in df['text']:
    post = row[0].lower()
    posts.append(post)

for row in dfm['text']:
    post = row[0].lower()
    postsm.append(post)

for row in dff['text']:
    post = row[0].lower()
    postsf.append(post)



vectorizer = TfidfVectorizer(stop_words='english',lowercase=True)
selector = SelectKBest(chi2, k=500)

# select top 500
# total
vectors = vectorizer.fit_transform(posts)
top500Vectors = selector.fit_transform(vectors, labels)

mask = selector.get_support()
features = vectorizer.get_feature_names_out()[mask]
allScores = selector.scores_
scores = allScores[mask]

# male
vectorsm = vectorizer.fit_transform(postsm)
top500Vectorsm = selector.fit_transform(vectorsm, labelsm)


maskm = selector.get_support()
featuresm = vectorizer.get_feature_names_out()[maskm]
allScoresm = selector.scores_
scoresm = allScoresm[maskm]

# female
vectorsf = vectorizer.fit_transform(postsf)
top500Vectorsf = selector.fit_transform(vectorsf, labelsf)


maskf = selector.get_support()
featuresf = vectorizer.get_feature_names_out()[maskf]
allScoresf = selector.scores_
scoresf = allScoresf[maskf]

top500Words = pd.DataFrame({'features':features,'scores':scores}).sort_values(by='scores', ascending=False)
top500Wordsm = pd.DataFrame({'features':featuresm,'scores':scoresm}).sort_values(by='scores', ascending=False)
top500Wordsf = pd.DataFrame({'features':featuresf,'scores':scoresf}).sort_values(by='scores', ascending=False)

totaltable = pd.concat([top500Words,top500Wordsm,top500Wordsf], axis=1)
print(totaltable.head(20))

print(totaltable.head(20).reset_index(drop=True).to_latex(caption='Top 20 TFIDF words',index=True))

# create vectors for SVM model
vectors = top500Vectors.todense()
vector_dicts = []
for vector in vectors:
    vector_dict = {i: value for i, value in enumerate(vector.tolist()[0])}
    vector_dicts.append(vector_dict)

vector_df = pd.DataFrame(vector_dicts)
df = pd.concat([df, vector_df], axis=1)
df.to_json("../data/vectorData/TFIDF500Vectors.json", orient='records', lines=True)

       features     scores    features    scores     features     scores
118    dialogue  25.097472   different  0.667628   depression  11.672804
113  depression  19.012485  depressive  0.563564       deeper   0.531270
17      anxiety  11.304031      animal  0.365658      anxiety   6.963480
416     student  10.044010          su  0.314968        story   0.780988
2            __   8.711688        2016  0.408216           __   5.592769
417    students   8.507423       sucks  0.666282       stress   0.500953
111   depressed   8.192506  depressing  0.407421       debate   1.002802
157        feel   6.920789      forces  0.303131      eyebrow   0.553865
158     feeling   6.909871      forums  0.331704      eyelash   0.632751
190       group   6.498619      hinges  0.319598         gone   0.501632
241        know   5.928631        long  0.609346         joey   0.593367
283  medication   5.409079      number  0.371076  medications   1.375825
475          ve   5.376466      versus  0.374553   

### Train model

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import numpy as np


In [3]:
df = pd.read_json('../data/vectorData/TFIDF500Vectors.json', orient='records', lines=True)

(df['gender'] == 'm') & (df['split'] == 'train')

X_train = df.loc[(df['split'] == 'train'), [str(i) for i in range(500)]]
X_test = df.loc[(df['split'] == 'test'), [str(i) for i in range(500)]]
# X_testm = df.loc[(df['gender'] == 'm') & (df['split'] == 'test'), [str(i) for i in range(712)]]
# X_testf = df.loc[(df['gender'] == 'f') & (df['split'] == 'test'), [str(i) for i in range(712)]]

y_train = df.loc[(df['split'] == 'train'), ['label']]
y_test = df.loc[(df['split'] == 'test'), ['label']]
# y_testm = df.loc[(df['gender'] == 'm') & (df['split'] == 'test'), ['label']]
# y_testf = df.loc[(df['gender'] == 'f') & (df['split'] == 'test'), ['label']]

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
# X_testm = scaler.transform(X_testm)
# X_testf = scaler.transform(X_testf)

y_train = y_train.values.ravel()

param_grid = {'C': np.logspace(-3, 3, 7),  
              'gamma': np.logspace(-3, 3, 7), 
              'kernel': ['rbf']}  

cv = 5
scoring = 'accuracy'

grid = GridSearchCV(SVC(random_state=99, probability=True, class_weight='balanced'), param_grid, scoring=scoring, cv=cv, n_jobs=-1, verbose=1)
grid.fit(X_train,y_train)

Fitting 5 folds for each of 49 candidates, totalling 245 fits


### Get predictions

In [4]:
y_pred = grid.predict(X_test)
y_prob = grid.predict_proba(X_test)[:,1]

testSet = df.loc[(df['split'] == 'test')]

testSet['prediction'] = y_pred
testSet['probability'] = y_prob

testSet.to_json('../data/predictionData/TFIDF500Pred.json',orient='records',lines=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testSet['prediction'] = y_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testSet['probability'] = y_prob
