In [None]:
"""
Steps for this project
1. dataset
2. data preprocessing
a. add new columns (I/E, N/S, T/F, P/J)
b. removal of unnecessary data points
b1. url links removed
b2. remove stopwords
b3. lemmatization
b4. tokenization using keras word tokenizer
3. data training
4. classification [tensorflow]

using the recurrent neural networks and support vector machine
    
"""

In [1]:
import pandas as pd
import re

# removal of stopwords
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')

# Count Vectorizer
import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

# Machine Learning Models 
from sklearn.svm import SVC



[nltk_data] Downloading package wordnet to /Users/sanzi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the MBTI posts dataset; later cells read its "type" and "posts" columns.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs on the author's computer. Consider a path relative to the notebook or a
# configurable DATA_DIR.
myersBrigs_df = pd.read_csv("/Users/sanzi/Desktop/school/personality dating app/mbti_1.csv")

In [3]:
myersBrigs_df

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...
...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...
8671,ENFP,'So...if this thread already exists someplace ...
8672,INTP,'So many questions when i do these things. I ...
8673,INFP,'I am very conflicted right now when it comes ...


## Data Preprocessing 

In [4]:
# Add 4 new columns for the following
# First Column: Introvert / Extrovert
# Second Column: Intuition / Sensation
# Third Column: Thinking / Feeling
# Fourth Column: Perception / Judgement

# Helper that fills a 0/1 indicator array, one entry per row, for one of the columns above
def PersonailityColumns_helper(matrixList, columnIndiciator, personailityList, matrixIndicator):
    """Fill ``matrixList`` with 1/0 flags, one per personality-type string.

    For every position ``k`` in ``personailityList``, ``matrixList[k]`` is set
    to 1 when character number ``matrixIndicator`` of that string equals
    ``columnIndiciator`` and to 0 otherwise.

    ``matrixList`` is modified in place and the same object is returned.
    Entries beyond ``len(personailityList)`` are left untouched.
    """
    for position, personality in enumerate(personailityList):
        letter_matches = personality[matrixIndicator] == columnIndiciator
        matrixList[position] = 1 if letter_matches else 0
    return matrixList
        

# Add the personailty column to the dataframe
def PersonailtyColumns(dataframe):
    """Insert four binary MBTI-axis columns into ``dataframe`` in place.

    Each new column holds 1.0 where the row's ``"type"`` string has the given
    letter at the given character index, and 0.0 otherwise:

    * ``"Extrovert/Introvert"``  -- 1.0 when character 0 is ``"E"``
    * ``"Intution/Sensation"``   -- 1.0 when character 1 is ``"S"``
    * ``"Thinking/Feeling"``     -- 1.0 when character 2 is ``"F"``
    * ``"Perception/Judgement"`` -- 1.0 when character 3 is ``"J"``

    The columns are inserted at positions 2 to 5, so ``dataframe`` must already
    have at least two columns, one of which is named ``"type"``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify. The previous implementation ignored this argument and
        always modified the global ``myersBrigs_df``; the argument is now the
        frame that is actually updated.

    Returns
    -------
    None
    """
    # (insert position, column name, character index in the type code, letter stored as 1.0)
    axes = (
        (2, "Extrovert/Introvert", 0, "E"),
        (3, "Intution/Sensation", 1, "S"),
        (4, "Thinking/Feeling", 2, "F"),
        (5, "Perception/Judgement", 3, "J"),
    )

    type_codes = dataframe["type"]
    for position, column_name, char_index, letter in axes:
        # Plain float array, so insertion is positional and the dtype is float64
        # exactly as before (the old code started from np.zeros).
        flags = (type_codes.str[char_index] == letter).astype(float).to_numpy()
        # allow_duplicates=True is kept from the original call signature.
        dataframe.insert(position, column_name, flags, allow_duplicates=True)

# Add these columns to the dataframe
PersonailtyColumns(myersBrigs_df)

In [5]:
myersBrigs_df

Unnamed: 0,type,posts,Extrovert/Introvert,Intution/Sensation,Thinking/Feeling,Perception/Judgement
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,0.0,0.0,1.0,1.0
1,ENTP,'I'm finding the lack of me in these posts ver...,1.0,0.0,0.0,0.0
2,INTP,'Good one _____ https://www.youtube.com/wat...,0.0,0.0,0.0,0.0
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",0.0,0.0,0.0,1.0
4,ENTJ,'You're fired.|||That's another silly misconce...,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...,0.0,1.0,1.0,0.0
8671,ENFP,'So...if this thread already exists someplace ...,1.0,0.0,1.0,0.0
8672,INTP,'So many questions when i do these things. I ...,0.0,0.0,0.0,0.0
8673,INFP,'I am very conflicted right now when it comes ...,0.0,0.0,1.0,0.0


In [6]:
# Create the working text column as a lower-cased copy of the raw posts; the
# Removal*/Lemmatization functions defined in the following cells rewrite this column.
myersBrigs_df["processed posts"] = myersBrigs_df["posts"].str.lower()

In [7]:
"""
remove unnecessary information from the posts at each index:
  - stopwords
  - url links
  - special characters and numbers
  - the 16 personality type codes that appear in the text
  - extra spaces
"""
def RemovalUrl(dataframe=None):
  """Replace every URL in the ``'processed posts'`` column with a single space.

  A URL is any run that starts with ``http`` and continues up to the next
  whitespace character. Text glued to the URL without a space (for example
  the ``|||`` post separator) is therefore removed together with it.

  The previous version ran three substitutions, but each one was applied to
  the untouched input, so only the result of the last (and broadest) pattern
  was ever stored. That single pattern is now applied directly, which keeps
  the stored text identical while dropping the discarded work.

  Parameters
  ----------
  dataframe : pandas.DataFrame, optional
      Frame whose ``'processed posts'`` column is rewritten in place.
      Defaults to the notebook-level ``myersBrigs_df``.

  Returns
  -------
  None
  """
  if dataframe is None:
    dataframe = myersBrigs_df

  # Vectorised replacement: no dependence on the index labels being 0..n-1 and
  # no use of the private DataFrame._get_value/_set_value accessors.
  dataframe['processed posts'] = dataframe['processed posts'].str.replace(
      r"http\S+", ' ', regex=True)

In [8]:
def RemvoalSpecial(dataframe=None):
  """Strip punctuation, digits and underscores from ``'processed posts'``.

  Four substitutions are applied in order, each replacing its match with a
  single space:

  1. runs of non-word characters,
  2. every ASCII digit,
  3. every underscore or plus sign,
  4. runs of whitespace (collapses the spaces left by the steps above).

  The patterns are now raw strings; the old non-raw literals relied on
  invalid escape sequences, which newer Python versions warn about.

  Parameters
  ----------
  dataframe : pandas.DataFrame, optional
      Frame whose ``'processed posts'`` column is rewritten in place.
      Defaults to the notebook-level ``myersBrigs_df``.

  Returns
  -------
  None
  """
  if dataframe is None:
    dataframe = myersBrigs_df

  cleaned = dataframe['processed posts']
  # Order matters: whitespace is collapsed last, after the other steps have
  # turned unwanted characters into spaces.
  for pattern in (r'\W+', r'[0-9]', r'[_+]', r'\s+'):
    cleaned = cleaned.str.replace(pattern, ' ', regex=True)

  dataframe['processed posts'] = cleaned

In [9]:
def RemovalStopWords(dataframe=None):
  """Drop NLTK English stopwords from every entry of ``'processed posts'``.

  Each post is split on single spaces, tokens found in
  ``stopwords.words('english')`` are discarded, and the remaining tokens are
  joined back with single spaces.

  Parameters
  ----------
  dataframe : pandas.DataFrame, optional
      Frame whose ``'processed posts'`` column is rewritten in place.
      Defaults to the notebook-level ``myersBrigs_df``.

  Returns
  -------
  None
  """
  if dataframe is None:
    dataframe = myersBrigs_df

  # A set gives constant-time membership tests; the old list was scanned once
  # per token.
  stopword_set = frozenset(stopwords.words('english'))

  def _drop_stopwords(text):
    return " ".join(token for token in text.split(' ') if token not in stopword_set)

  # Series.map replaces the per-row _get_value/_set_value loop, which relied on
  # a private pandas API and on the index labels being 0..n-1.
  dataframe['processed posts'] = dataframe['processed posts'].map(_drop_stopwords)

In [10]:
def RemovalPersonalilty(dataframe=None):
  """Remove the 16 lower-case MBTI type codes from ``'processed posts'``.

  Each post is split on single spaces; tokens that are exactly one of the 16
  codes (``'infp'``, ``'entj'``, ...) are dropped and the rest are re-joined
  with single spaces. Tokens that merely contain a code (e.g. ``'infps'``)
  are kept.

  Parameters
  ----------
  dataframe : pandas.DataFrame, optional
      Frame whose ``'processed posts'`` column is rewritten in place.
      Defaults to the notebook-level ``myersBrigs_df``.

  Returns
  -------
  None
  """
  if dataframe is None:
    dataframe = myersBrigs_df

  type_codes = frozenset([
      'infp', 'infj', 'intp', 'intj',
      'entp', 'enfp', 'istp', 'isfp',
      'entj', 'istj', 'enfj', 'isfj',
      'estp', 'esfp', 'esfj', 'estj',
  ])

  def _drop_type_codes(text):
    return " ".join(token for token in text.split(' ') if token not in type_codes)

  # Series.map replaces the per-row _get_value/_set_value loop, which relied on
  # a private pandas API and on the index labels being 0..n-1.
  dataframe["processed posts"] = dataframe["processed posts"].map(_drop_type_codes)

In [11]:
def Lemmatization(dataframe=None):
  """Lemmatize every space-separated token of ``'processed posts'``.

  Each post is split on single spaces, every token is passed through the
  notebook-level ``lemmatizer.lemmatize`` and the results are joined back
  with single spaces.

  Parameters
  ----------
  dataframe : pandas.DataFrame, optional
      Frame whose ``'processed posts'`` column is rewritten in place.
      Defaults to the notebook-level ``myersBrigs_df``.

  Returns
  -------
  None
  """
  if dataframe is None:
    dataframe = myersBrigs_df

  def _lemmatize_post(text):
    return " ".join(lemmatizer.lemmatize(token) for token in text.split(' '))

  # Series.map replaces the per-row _get_value/_set_value loop, which relied on
  # a private pandas API and on the index labels being 0..n-1.
  dataframe['processed posts'] = dataframe['processed posts'].map(_lemmatize_post)

In [12]:
def dataPreprocessing():
    """Run all text-cleaning steps on the notebook's dataframe, in order.

    Order: URL removal, special-character/digit removal, stopword removal,
    MBTI type-code removal, lemmatization. Every step is called without
    arguments and returns nothing.
    """
    cleaning_steps = (
        RemovalUrl,
        RemvoalSpecial,
        RemovalStopWords,
        RemovalPersonalilty,
        Lemmatization,
    )
    for run_step in cleaning_steps:
        run_step()
    

In [13]:
dataPreprocessing()

In [14]:
myersBrigs_df

Unnamed: 0,type,posts,Extrovert/Introvert,Intution/Sensation,Thinking/Feeling,Perception/Judgement,processed posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,0.0,0.0,1.0,1.0,moment sportscenter top ten play prank life c...
1,ENTP,'I'm finding the lack of me in these posts ver...,1.0,0.0,0.0,0.0,finding lack post alarming sex boring positio...
2,INTP,'Good one _____ https://www.youtube.com/wat...,0.0,0.0,0.0,0.0,good one course say know blessing curse absol...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",0.0,0.0,0.0,1.0,dear enjoyed conversation day esoteric gabbin...
4,ENTJ,'You're fired.|||That's another silly misconce...,1.0,0.0,0.0,1.0,fired another silly misconception approaching...
...,...,...,...,...,...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...,0.0,1.0,1.0,0.0,always think cat fi doms reason website becom...
8671,ENFP,'So...if this thread already exists someplace ...,1.0,0.0,1.0,0.0,thread already exists someplace else heck del...
8672,INTP,'So many questions when i do these things. I ...,0.0,0.0,0.0,0.0,many question thing would take purple pill pi...
8673,INFP,'I am very conflicted right now when it comes ...,0.0,0.0,1.0,0.0,conflicted right come wanting child honestly ...


## Feature Selection

In [15]:
# Using CountVectorizer we are converting the posts into a matrix 

def setProcessList(dataframe=None):
    """Return the ``"processed posts"`` column as a plain Python list.

    The previous loop used ``Series.iteritems()``, which was removed in
    pandas 2.0; ``Series.tolist()`` yields the same values in the same order
    on every pandas version.

    Parameters
    ----------
    dataframe : pandas.DataFrame, optional
        Frame to read from. Defaults to the notebook-level ``myersBrigs_df``.

    Returns
    -------
    list
        One entry per row, in row order.
    """
    if dataframe is None:
        dataframe = myersBrigs_df

    return dataframe["processed posts"].tolist()

In [74]:
def vectorization(input):
    """Turn an iterable of text documents into a dense TF-IDF matrix.

    A new ``CountVectorizer`` (English stop words, at most 1500 features) and a
    new ``TfidfTransformer`` are created and fitted on ``input`` on every call;
    the TF-IDF result is converted to a dense array and returned.

    NOTE(review): because both objects are re-fitted on each call, the columns
    of matrices returned by separate calls presumably do not refer to the same
    words -- confirm before feeding a second call's output to a model trained
    on the first.
    """
    term_counts = CountVectorizer(stop_words="english", max_features = 1500).fit_transform(input)

    # re-weight the raw counts with tf-idf, then densify
    weighted_counts = TfidfTransformer().fit_transform(term_counts)
    return weighted_counts.toarray()

In [None]:
# Collect the cleaned posts into a list, then convert them to the TF-IDF
# feature matrix used as model input in the cells below.
list_input = setProcessList()
tf_vector = vectorization(list_input)

In [75]:
# X_data is the TF-IDF feature matrix of the processed posts
X_data = tf_vector

# Y_data holds the target columns at positions 2-5 of the dataframe. These are the
# four binary columns inserted by PersonailtyColumns (Extrovert/Introvert,
# Intution/Sensation, Thinking/Feeling, Perception/Judgement), provided the
# column order has not been changed since.
Y_data = myersBrigs_df.iloc[:, 2:6]

In [17]:
Y_data["Extrovert/Introvert"].value_counts()

0.0    6676
1.0    1999
Name: Extrovert/Introvert, dtype: int64

In [49]:
# Cross-check the 0/1 encoding against the raw labels: the first letter of
# "type" is I or E. None of the cells shown here creates an
# 'Introvert/Extrovert' column, so looking it up directly raises KeyError on a
# fresh kernel; derive it from "type" instead.
myersBrigs_df["type"].str[0].rename("Introvert/Extrovert").value_counts()

I    6676
E    1999
Name: Introvert/Extrovert, dtype: int64

In [18]:
# Train and Test set for each column
# A fixed seed makes the split reproducible across kernel restarts. All four
# calls share the seed, the row count and the stratification target, so every
# axis is trained and evaluated on the same rows.
SPLIT_SEED = 123
X_train_IE, X_test_IE, Y_train_IE, Y_test_IE = train_test_split(X_data, Y_data['Extrovert/Introvert'], test_size=0.2, stratify=Y_data, random_state=SPLIT_SEED)
X_train_NS, X_test_NS, Y_train_NS, Y_test_NS = train_test_split(X_data, Y_data['Intution/Sensation'], test_size=0.2, stratify=Y_data, random_state=SPLIT_SEED)
X_train_TF, X_test_TF, Y_train_TF, Y_test_TF = train_test_split(X_data, Y_data['Thinking/Feeling'], test_size=0.2, stratify=Y_data, random_state=SPLIT_SEED)
X_train_JP, X_test_JP, Y_train_JP, Y_test_JP = train_test_split(X_data, Y_data['Perception/Judgement'], test_size=0.2, stratify=Y_data, random_state=SPLIT_SEED)

## Personality Prediction Models

### Support Vector Model
Takes about 11 minutes to run, which is not optimal for the app, although its accuracy is higher than Naive Bayes.

In [19]:
# Train one Support Vector classifier per personality axis and predict on that
# axis's held-out test split.

def _fit_and_predict_svm(X_train, y_train, X_test):
    """Fit a fresh SVC on the training split; return (model, test predictions)."""
    model = SVC(random_state=123, probability=True)
    model.fit(X_train, y_train)
    return model, model.predict(X_test)

# Extrovert or Introvert prediction
svm1, ypredIE = _fit_and_predict_svm(X_train_IE, Y_train_IE, X_test_IE)

# Sensation or Intuition prediction
svm2, ypredNS = _fit_and_predict_svm(X_train_NS, Y_train_NS, X_test_NS)

# Thinking or Feeling prediction
svm3, ypredTF = _fit_and_predict_svm(X_train_TF, Y_train_TF, X_test_TF)

# Judgement or Perception prediction
svm4, ypredJP = _fit_and_predict_svm(X_train_JP, Y_train_JP, X_test_JP)


In [20]:
Y_test_JP

7061    0.0
1272    0.0
3912    1.0
2183    0.0
5647    0.0
       ... 
7796    0.0
2167    1.0
340     1.0
4550    0.0
884     0.0
Name: Perception/Judgement, Length: 1735, dtype: float64

In [72]:
ypredJP[4]

1.0

### Naive Bayes

In [46]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

In [81]:
# Creating a model prediction with 4 models for each category 

def modelPrediction(m1, m2, m3, m4):
    """Fit one search object per personality axis and predict on its test split.

    Each argument is fitted on the notebook-level training split for its axis;
    the fitted object's ``best_estimator_`` then predicts the matching test
    split.

    Parameters
    ----------
    m1, m2, m3, m4 : objects exposing ``fit(X, y)`` that returns an object with
        a ``best_estimator_`` attribute (e.g. ``GridSearchCV``), used for the
        Extrovert/Introvert, Intuition/Sensation, Thinking/Feeling and
        Perception/Judgement axes respectively.

    Returns
    -------
    tuple
        ``(ypredEI, ypredIS, ypredTF, ypredPJ)``, in the same order as the
        arguments. The previous version returned the last two swapped, so a
        caller unpacking in axis order received the Perception/Judgement
        predictions under the Thinking/Feeling name and vice versa.
    """
    # Predict if they are an extrovert or introvert
    ypredEI = m1.fit(X_train_IE, Y_train_IE).best_estimator_.predict(X_test_IE)

    # Predict if they are intuition or sensation
    ypredIS = m2.fit(X_train_NS, Y_train_NS).best_estimator_.predict(X_test_NS)

    # Predict if they are thinking or feeling
    ypredTF = m3.fit(X_train_TF, Y_train_TF).best_estimator_.predict(X_test_TF)

    # Predict if they are perception or judgement
    ypredPJ = m4.fit(X_train_JP, Y_train_JP).best_estimator_.predict(X_test_JP)

    return ypredEI, ypredIS, ypredTF, ypredPJ

In [82]:
# Gaussian Naive Bayes is the base estimator for every axis
naivegb = GaussianNB()

# One 5-fold grid search per axis; the empty parameter grid means each search
# only cross-validates the estimator as configured.
grid1, grid2, grid3, grid4 = [GridSearchCV(naivegb, {}, cv=5) for _ in range(4)]

# Fit each search and collect the predictions for the four axes
ypredIE, ypredNS, ypredTF, ypredJP = modelPrediction(grid1, grid2, grid3, grid4)

In [83]:
ypredNS

array([0., 0., 0., 0., 0., 0.])

In [84]:
ypredIE

array([0., 0., 0., 0., 0., 0.])

In [79]:
# How will we predict the user's personality?
# test_input holds six texts: indexes 0-4 are posts from the Myers-Briggs
# dataframe and index 5 stands in for the user's input we want to predict.
# The models were trained on the cleaned text, so the examples are taken from
# the "processed posts" column rather than the raw "posts" column.
test_input = myersBrigs_df["processed posts"].iloc[:6].tolist()

# Convert the texts to a TF-IDF matrix.
# NOTE(review): vectorization() fits a new CountVectorizer on these six texts,
# so the resulting columns presumably do not line up with the features the
# models were trained on -- confirm, and reuse the fitted vectorizer if so.
test_tf_vector = vectorization(test_input)
test_x_data = test_tf_vector