In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
X_data = pd.read_csv('features_DEMO.csv')
y_data = pd.read_csv('train.csv')["toxic"]

print(X_data.shape) # important in colab, it will take partial data even if it is not fully read in
print(y_data.shape)
y_data.head()

(159571, 84)
(159571,)


Unnamed: 0,toxic
0,0
1,0
2,0
3,0
4,0


In [4]:
# from sklearn import preprocessing
# X_data.fillna(0, inplace=True)
# X_data_norm = preprocessing.normalize(X_data)

In [7]:
# old code, built later stuff off of this

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    # ('lda', LinearDiscriminantAnalysis()), # worse with this
    ('classification', LogisticRegression())
])

param_grid = {
    'pca__n_components': [n for n in range(66, 72)],
    # 'lda__solver': ['svd'], # others did not seem to make a difference
    # 'lda__n_components': [1]
}

# used precision to focus on negatives
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='recall', verbose=1, n_jobs=-1)
grid_search.fit(X_data, y_data)

print("Best parameters:", grid_search.best_params_)

# sort the results to display the best params/models
results = [
    {"params": grid_search.cv_results_["params"][i], "score": grid_search.cv_results_["mean_test_score"][i]}
    for i in range(len(grid_search.cv_results_["params"]))
]
sorted_results = sorted(results, key=lambda x: x["score"], reverse=True)

for entry in sorted_results:
    print(entry)

# 0.81585 at 68 PCA

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best parameters: {'pca__n_components': 71}
{'params': {'pca__n_components': 71}, 'score': np.float64(0.5528966514446323)}
{'params': {'pca__n_components': 70}, 'score': np.float64(0.5528966300643695)}
{'params': {'pca__n_components': 69}, 'score': np.float64(0.5527004875341309)}
{'params': {'pca__n_components': 67}, 'score': np.float64(0.5520464866776376)}
{'params': {'pca__n_components': 68}, 'score': np.float64(0.5518504510487126)}
{'params': {'pca__n_components': 66}, 'score': np.float64(0.5517851343460879)}


In [26]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis # performed worse with this in the pipeline
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

class PCA_finder:
  def __init__(self, X, y, classifier=LogisticRegression(), scoring_="recall"):
    # find values on construction. Save with method
    self.X = X
    self.y = y
    self.model, self.n_components, _ = self.find_best_pca_components(classifier, scoring=scoring_)
    self.trans_X = self.model.named_steps["pca"].transform(self.X)

  ### Use cross validation on a pca pipeline, testing values of n_components until
  ### improvements are bellow threshold self.tol.
  def find_best_pca_components(self,
                               classifier, tol=0.001,
                               X=None, y=None,
                               early_stopping=True,
                               scoring="recall"):
    if X is None:
      X = np.array(self.X)
    if y is None:
      y = self.y

    # logging
    logs = []

    # tracking
    best_score = -np.inf
    best_n = 0
    recommend_score = -np.inf
    recommend_n = -1
    model = None

    for n in range(1, self.X.shape[1]):
      model = Pipeline([
          ("scaler", StandardScaler()),
          ("pca", PCA(n_components=n)),
          ("model", classifier)
      ])
      score = np.mean(cross_val_score(model, self.X, self.y, cv=5, scoring=self.scoring))
      model.fit(self.X, self.y)

      # logs
      logs.append([model, n, score])

      # track best score
      if score > best_score:
          best_score = score
          best_n = n

      # track recommended
      if n>1 and abs(logs[n-1][2]-logs[n-2][2]) < tol:
          recommend_n = n
          recommend_score = score
          if early_stopping:
            break

    if not early_stopping:
      print(f"Best number of components: {best_n} with {self.scoring} score: {best_score}")
    print(f"Recommended number of components: {recommend_n} with {self.scoring} score: {best_score}")
    return logs[recommend_n-1]

  def report_scores(self, X=None, y=None):
    if X is None:
      X = self.X
    if y is None:
      y = self.y

    y_pred = self.model.predict(X)
    f1 = f1_score(y, y_pred, average="macro")
    print(f"f1_macro : {f1:.4}")
    accuracy = accuracy_score(y, y_pred)
    print(f"accuracy : {accuracy:.4}")
    precision = precision_score(y, y_pred)
    print(f"precision: {precision:.4}")
    recall = recall_score(y, y_pred)
    print(f"recall   : {recall:.4}")

  def get_model(self):
    return self.model

  def predict(self, X):
    return self.model.predict(X)

  def transform_data(self, X):
    self.trans_X = self.model.named_steps["pca"].transform(X)
    return self.trans_X

  def save_transformed_data(self, file_name="trans_X.csv"):
    if self.trans_X:
      np.savetxt(file_name, self.trans_X, delimiter=",")

  def save_model(self, file_name="model.pkl"):
    import pickle
    with open(file_name, "wb") as f:
      pickle.dump(self.model, f)


In [29]:
pca_finder = PCA_finder(X_data, y_data, scoring_="recall")
pca_finder.report_scores()

Recommended number of components: 11 with precision score: 0.7849373886220191




f1_macro : 0.7867
accuracy : 0.9386
precision: 0.7861
recall   : 0.4939


In [28]:
# sns.lineplot(x=components, y=scores)
# plt.show()
pca_finder.report_scores()

f1_macro : 0.7869
accuracy : 0.9387
precision: 0.7861
recall   : 0.4945
