<a href="https://colab.research.google.com/github/ChristopherCrook/50_Assignment5_Crook/blob/main/HonestyNotebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project Overview
This project will attempt to identify a correlation between a person's religion and whether they display personality traits indicative of Machiavellian tendencies. The prediction will also take into account whether the person considers themselves more introverted or extroverted.

## Data Set
Machiavellianism Test on Kaggle

## Performance Measures
This will be based on a percentage of people with more honest traits and what their religious preference is along with whether they identify as being more reserved or extroverted.

# Feature Engineering

In [None]:
#tables and visualizations
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

#machine learning
from sklearn import config_context
from sklearn import svm
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, StandardScaler, OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Read the tab-separated survey export straight from the project's GitHub repository
data = pd.read_csv(
    'https://github.com/vanderbilt-ml/50-Crook-mlproj-Honesty/blob/main/data.csv?raw=true',
    sep='\t',
)
# First look at the frame: column names, dtypes and non-null counts
data.info()

# Compute Scores, Delete Columns and Add Scores columns
## We don't necessarily care about the questions themselves, but we do care about the total score of all questions, so we need to calculate them. Values should be between 20-100.

In [None]:
# Pieces used to build the per-question column names (e.g. "Q1A").
# Only the "A" suffix is needed in this cell; the "I" and "E" suffixes are
# declared here because the column-dropping cell below builds names from them.
prefix = "Q"
a_suffix = "A"
i_suffix = "I"
e_suffix = "E"

# The 20 answer columns Q1A..Q20A whose values make up a row's total score
answer_columns = [prefix + str(question) + a_suffix for question in range(1, 21)]

# Sum the answers of every row in one vectorized pass instead of doing 20
# scalar .loc lookups per row. skipna=False mirrors plain addition: a row with
# a missing answer gets a NaN total rather than a silently smaller score.
# Kept as a plain list, in row order, so it can be assigned as a column later.
scores = data[answer_columns].sum(axis=1, skipna=False).tolist()

# Print, just so we know everything went okay (Uncomment only if needed)
#for score in scores :
#  print(score)

# Check our scores to ensure that we don't have any crazy values.
# The test has to look at the score itself: comparing the loop index (as this
# cell used to) only counted row positions outside 20..100.
# Phrased as "not inside the range" so that NaN totals are counted too.
outliers = sum(1 for score in scores if not (20 <= score <= 100))

print("Outliers: " + str(outliers))

In [None]:
# Gather the name of every per-question column: for each of the 20 questions
# there is one column per suffix (A, I and E)
question_columns = []
for number in range(1, 21) :
  stem = prefix + str(number)
  question_columns.extend([stem + a_suffix, stem + i_suffix, stem + e_suffix])

# Remove all of them from the frame with a single in-place drop
data.drop(columns=question_columns, inplace=True)

# Attach the per-row totals computed in the previous cell
data['Score'] = scores

In [None]:
# Show the data after the conversion.
# display() renders the frame as a notebook table (the same call the sanity-check
# cell uses for the train/test previews) instead of print()'s plain-text dump.
display(data)

In [None]:
# Count the missing entries in each column (isna is pandas' alias of isnull)
data.isna().sum()

In [None]:
# First visual pass over the data: a grid of pairwise plots, colored by the
# value of the 'religion' column
sns.pairplot(data=data, hue='religion')
plt.show()

# Split the Data

In [None]:
# Target column and the seed that makes the split repeatable
class_column = 'religion'
random_seed = 2435

# Separate the predictors from the target, then hold out 20% of the rows for
# testing, stratified on the target
features = data.drop(columns=class_column)
target = data[class_column]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=random_seed,
    stratify=target,
)

Sanity Check

In [None]:
# Report the shape and the first rows of each predictor split.
# Each entry: (header line, label printed before the shape, frame to preview)
split_reports = [
    ('On X train: ', 'X train dimensions: ', X_train),
    ('\nOn X test: ', 'X test dimensions: ', X_test),
]

for header, dimensions_label, frame in split_reports:
    print(header)
    print(dimensions_label, frame.shape)
    display(frame.head())

Create Pipelines

In [None]:
# One preprocessing pipeline per kind of column.

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode (drop='if_binary' drops one of the two columns for binary features)
cat_steps = [
    ('cat_impute', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('onehot_cat', OneHotEncoder(drop='if_binary')),
]
cat_pipeline = Pipeline(steps=cat_steps)

# Numeric columns: fill gaps with the column mean, then standardize
num_steps = [
    ('impute_num', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scale_num', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

In [None]:
# Route each column to the pipeline that matches its dtype; columns picked up
# by neither selector are passed through unchanged
object_columns = make_column_selector(dtype_include=object)
numeric_columns = make_column_selector(dtype_include=np.number)

preproc = ColumnTransformer(
    [
        ('cat_pipe', cat_pipeline, object_columns),
        ('num_pipe', num_pipeline, numeric_columns),
    ],
    remainder='passthrough',
)

In [None]:
# Full modeling pipeline: shared preprocessing followed by an elastic-net
# logistic regression. l1_ratio is deliberately not set here; it is supplied
# by the hyperparameter grid during the search.
# The saga solver shuffles the data, so it is seeded with the notebook's
# random_seed (defined in the split cell) to make fits repeatable.
LRpipe = Pipeline(steps=[('preproc', preproc),
                       ('mdl', LogisticRegression(penalty='elasticnet', solver='saga', tol=0.01,
                                                  random_state=random_seed))])

#visualization for steps
with config_context(display='diagram'):
    display(LRpipe)

# Assignment 5
## Explore 3 different models in your ML pipeline for your personal project

In [None]:
# Set up Random Forest Pipeline: shared preprocessing followed by the forest.
# The forest is seeded with the notebook's random_seed (defined in the split
# cell); without a seed every fit grows different trees and scores cannot be
# reproduced between runs.
randomForest_pipe = Pipeline(steps=[('preproc', preproc),
                       ('mdl', RandomForestClassifier(random_state=random_seed))])

#visualization for steps
with config_context(display='diagram'):
    display(randomForest_pipe)

# Set up Naive Bayes classifier for multinomial models.
# MultinomialNB refuses negative feature values, and the shared `preproc`
# standardizes numeric columns (StandardScaler), which produces negatives.
# This pipeline therefore gets its own preprocessor: identical routing and
# imputation, but numeric columns are rescaled to [0, 1] instead.
# clip=True keeps held-out values that fall outside the training range inside
# [0, 1], so they cannot turn negative at prediction time.
nb_num_pipeline = Pipeline(steps=[('impute_num', SimpleImputer(missing_values=np.nan, strategy='mean')),
                                  ('scale_num', MinMaxScaler(clip=True))])

nb_preproc = ColumnTransformer([('cat_pipe', cat_pipeline, make_column_selector(dtype_include=object)),
                                ('num_pipe', nb_num_pipeline, make_column_selector(dtype_include=np.number))],
                                remainder='passthrough')

naiveBayes_pipe = Pipeline(steps=[('preproc', nb_preproc),
                       ('mdl', MultinomialNB())])

#visualization for steps
with config_context(display='diagram'):
    display(naiveBayes_pipe)

# Cross Validation with Hyperparameter Tuning

In [None]:
# Set up tuning grids.
# Keys use the "<step name>__<parameter>" form so GridSearchCV can reach the
# estimator stored under the 'mdl' step of each pipeline.

# Logistic regression: elastic-net mixing ratio and inverse regularization strength
logisticRegression_tuning_grid = {'mdl__l1_ratio' : np.linspace(0,1,5),
               'mdl__C': np.logspace(-1, 6, 3) }

# Random forest: number of trees and maximum tree depth
randomForest_tuning_grid = {'mdl__n_estimators' : [100, 200 ,500],
               'mdl__max_depth': [10, 15, 20] }

# Naive Bayes: additive smoothing strength.
# This name previously held a RepeatedStratifiedKFold object, which is a
# cross-validation splitter and cannot be used as a parameter grid.
NB_tuning_grid = {'mdl__alpha': np.logspace(-2, 1, 4)}

# The repeated stratified splitter is kept under its own name so it can be
# passed as the `cv` argument of a search if repeated folds are wanted.
NB_cv = RepeatedStratifiedKFold(n_splits=5,  n_repeats=3, random_state=999)

In [None]:
# fit the models
# NOTE(review): every search below is commented out, so running this cell
# currently fits nothing -- presumably disabled because of run time; confirm.
# Each one is a 5-fold grid search over the matching pipeline and tuning grid.

# Logistic Regression
#logisticRegression_grid_search = GridSearchCV(LRpipe, param_grid = logisticRegression_tuning_grid, cv = 5, return_train_score=True)
#logisticRegression_grid_search.fit(X_train, y_train)

# Now let's do the Random Forest Classifier
#randomForest_grid_search = GridSearchCV(randomForest_pipe, param_grid = randomForest_tuning_grid, cv = 5, return_train_score=True)
#randomForest_grid_search.fit(X_train, y_train)

# Now let's do the Naive Bayes Classifier
#NB_grid_search = GridSearchCV(naiveBayes_pipe, param_grid = NB_tuning_grid, cv = 5, return_train_score=True)
#NB_grid_search.fit(X_train, y_train)