In [21]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.feature_selection import SelectKBest, chi2


### Create vectors

In [25]:
df = pd.read_json('../data/splitData/postdataLinesSplit.json',lines=True)
labels = df['label']

posts = []

for row in df['text']:
    post = row[0].lower()
    posts.append(post)

vectorizer = TfidfVectorizer(stop_words='english',lowercase=True)
vectors = vectorizer.fit_transform(posts)

# select top 500
selector = SelectKBest(chi2, k=500)
top500Vectors = selector.fit_transform(vectors, labels)
vectors = top500Vectors.todense()

vector_dicts = []
for vector in vectors:
    vector_dict = {i: value for i, value in enumerate(vector.tolist()[0])}
    vector_dicts.append(vector_dict)

vector_df = pd.DataFrame(vector_dicts)
df = pd.concat([df, vector_df], axis=1)
df.to_json("../data/vectorData/TFIDF500Vectors.json", orient='records', lines=True)



### Train model

In [28]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import numpy as np


In [29]:
df = pd.read_json('../data/vectorData/TFIDF500Vectors.json', orient='records', lines=True)

(df['gender'] == 'm') & (df['split'] == 'train')

X_train = df.loc[(df['split'] == 'train'), [str(i) for i in range(500)]]
X_test = df.loc[(df['split'] == 'test'), [str(i) for i in range(500)]]
# X_testm = df.loc[(df['gender'] == 'm') & (df['split'] == 'test'), [str(i) for i in range(712)]]
# X_testf = df.loc[(df['gender'] == 'f') & (df['split'] == 'test'), [str(i) for i in range(712)]]

y_train = df.loc[(df['split'] == 'train'), ['label']]
y_test = df.loc[(df['split'] == 'test'), ['label']]
# y_testm = df.loc[(df['gender'] == 'm') & (df['split'] == 'test'), ['label']]
# y_testf = df.loc[(df['gender'] == 'f') & (df['split'] == 'test'), ['label']]

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
# X_testm = scaler.transform(X_testm)
# X_testf = scaler.transform(X_testf)

y_train = y_train.values.ravel()

param_grid = {'C': np.logspace(-3, 3, 7),  
              'gamma': np.logspace(-3, 3, 7), 
              'kernel': ['rbf']}  

cv = 5
scoring = 'accuracy'

grid = GridSearchCV(SVC(random_state=99, probability=True, class_weight='balanced'), param_grid, scoring=scoring, cv=cv, n_jobs=-1, verbose=1)
grid.fit(X_train,y_train)

Fitting 5 folds for each of 49 candidates, totalling 245 fits


### Get predictions

In [30]:
y_pred = grid.predict(X_test)

testSet = df.loc[(df['split'] == 'test')]

testSet['prediction'] = y_pred

testSet.to_json('../data/predictionData/TFIDF500Pred.json',orient='records',lines=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testSet['prediction'] = y_pred
