## Import Libraries

In [1]:
# basic libraries
import re
import pandas as pd

In [43]:
# nlp libraries
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

# Data packages required further down:
#   wordnet   - WordNetLemmatizer and the noun lookup
#   stopwords - English stopword list
#   punkt     - tokenizer models used by nltk.word_tokenize
#   punkt_tab - name under which newer NLTK releases look up the same tokenizer data
# Without the Punkt data, word_tokenize raises LookupError on a fresh machine.
# quiet=True keeps machine-specific download paths out of the saved output.
for resource in ("wordnet", "stopwords", "punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\arpan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arpan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [91]:
# machine learning libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [95]:
# other libraries
import joblib

## Defining DataFrames

In [35]:
# Load the labelled dialogues into a DataFrame.
# The path is relative, so the CSV must sit in the notebook's working directory.
df = pd.read_csv("gender_dialogues.csv")

In [36]:
# Preview the first 10 rows to sanity-check the Gender and Dialogue columns
df.head(10)

Unnamed: 0,Gender,Dialogue
0,Female,I can't believe he stood me up again.
1,Male,I've been working on this car all weekend.
2,Female,I wish I could find the perfect pair of shoes.
3,Male,I can't wait for the next game; it's gonna be ...
4,Female,Why is it so hard to find good chocolate?
5,Male,I need to hit the gym; these workouts are tough.
6,Female,Girls' night out is exactly what I need.
7,Male,I love the smell of a good barbecue.
8,Female,I wish I could eat dessert every day.
9,Male,I've been binge-watching this new series.


In [37]:
# Shared preprocessing resources used by the cleaning function
lemma = WordNetLemmatizer()                         # maps a token to its WordNet base form
set_of_stopwords = set(stopwords.words("english"))  # a set keeps membership checks cheap

In [38]:
# function to remove contractions and replace by spaces
# Contractions (plus the informal "gonna") that are stripped before tokenising.
_CONTRACTIONS = (
    "ain't", "aren't", "can't", "can't've", "'cause", "could've", "couldn't", "couldn't've", "didn't", "doesn't", "don't",
    "hadn't", "hadn't've", "hasn't", "haven't", "he'd", "he'd've", "he'll", "he'll've", "he's", "how'd", "how'd'y", "how'll", "how's",
    "I'd", "I'd've", "I'll", "I'll've", "I'm", "I've", "isn't", "it'd", "it'd've", "it'll", "it'll've", "it's", "let's",
    "ma'am", "mayn't", "might've", "mightn't", "mightn't've", "must've", "mustn't", "mustn't've", "needn't", "needn't've",
    "o'clock", "oughtn't", "oughtn't've", "shan't", "sha'n't", "shan't've", "she'd", "she'd've", "she'll", "she'll've", "she's",
    "should've", "shouldn't", "shouldn't've", "so've", "so's", "that'd", "that'd've", "that's", "there'd", "there'd've", "there's",
    "they'd", "they'd've", "they'll", "they'll've", "they're", "they've", "to've", "wasn't",
    "we'd", "we'd've", "we'll", "we'll've", "we're", "we've", "weren't", "what'll", "what'll've", "what're", "what's", "what've",
    "when's", "when've", "where'd", "where's", "where've", "who'll", "who'll've", "who's", "who've", "why's", "why've",
    "will've", "won't", "won't've", "would've", "wouldn't", "wouldn't've", "y'all", "y'all'd", "y'all'd've", "y'all're", "y'all've",
    "you'd", "you'd've", "you'll", "you'll've", "you're", "you've", "gonna",
)

# Single compiled alternation, built once:
#   * longest entries first, so "can't've" is consumed whole instead of "can't"
#     matching and leaving a dangling "'ve";
#   * (?<!\w) / (?!\w) guards, so "he's" no longer fires inside "she's";
#   * IGNORECASE, so sentence-initial forms such as "It's" or "Don't" are caught;
#   * each apostrophe also accepts the typographic form (U+2019).
_CONTRACTION_PATTERN = re.compile(
    r"(?<!\w)(?:"
    + "|".join(
        "['\u2019]".join(re.escape(part) for part in contraction.split("'"))
        for contraction in sorted(_CONTRACTIONS, key=len, reverse=True)
    )
    + r")(?!\w)",
    re.IGNORECASE,
)


def remove_contractions(text):
    """Replace every known contraction in ``text`` with a single space.

    Matching is case-insensitive and restricted to whole words, and the longest
    contraction wins when several overlap (e.g. "couldn't've" over "couldn't").

    Parameters
    ----------
    text : str
        Raw text, possibly containing contractions.

    Returns
    -------
    str
        ``text`` with each contraction replaced by one space; all other
        characters, including surrounding whitespace, are left untouched.
    """
    return _CONTRACTION_PATTERN.sub(" ", text)

In [39]:
# function to remove punctuations
def remove_punctuations(text):
    """Return ``text`` with every character that is neither a word character
    nor whitespace deleted (nothing is inserted in its place)."""
    return re.sub(r'[^\w\s]', '', text)

In [70]:
# function to check noun
def is_noun(word):
    """Return True when at least one WordNet synset of ``word`` has part of
    speech 'n'; False otherwise, including when WordNet has no synsets for it."""
    return any(sense.pos() == 'n' for sense in wordnet.synsets(word))

In [76]:
# dialogue to cleaned dialogue
def cleaned_dialogue(text):
    """Reduce a raw dialogue line to a space-separated string of noun lemmas.

    Steps, in order: drop contractions, strip punctuation, tokenize, lowercase
    and remove English stopwords, lemmatize, then keep only the words that
    WordNet lists as a noun in at least one sense.

    Parameters
    ----------
    text : str or None
        Raw dialogue. Missing values are accepted: None, or the NaN that
        pandas stores for an empty CSV cell.

    Returns
    -------
    str or None
        The cleaned dialogue (possibly an empty string), or None when ``text``
        is missing.
    """
    # pd.isna covers None as well as float NaN; an `is None` check alone lets
    # NaN through, which then fails on the string operations below.
    if pd.isna(text):
        return None

    contractionless_text = remove_contractions(text)
    punctuationless_text = remove_punctuations(contractionless_text)
    tokens = nltk.word_tokenize(punctuationless_text)

    # Lowercase once, then filter against the stopword set.
    lowered_tokens = (token.lower() for token in tokens)
    filtered_words = [word for word in lowered_tokens if word not in set_of_stopwords]

    # Lemmatize before the noun check so plural forms are looked up by their base form.
    lemmatized_words = [lemma.lemmatize(word) for word in filtered_words]
    return ' '.join(word for word in lemmatized_words if is_noun(word))

In [77]:
# Cleaned text for every dialogue, kept in row order so it lines up with df
dialogue_list = [cleaned_dialogue(dialogue) for dialogue in df.Dialogue]
df['Cleaned Dialogue'] = dialogue_list

In [98]:
# Inspect the last 10 rows to compare each raw Dialogue with its Cleaned Dialogue
df.tail(10)

Unnamed: 0,Gender,Dialogue,Cleaned Dialogue
540,Female,The fragrance of blooming flowers uplifts my s...,fragrance blooming flower uplift spirit
541,Male,Learning about astronomy deepens my appreciati...,learning astronomy appreciation cosmos
542,Female,I enjoy the challenge of solving challenging S...,challenge solving sudoku puzzle
543,Male,Sailing on calm waters brings a sense of freed...,sailing calm water sense freedom peace
544,Female,I find joy in exploring local farmers' markets...,find joy local farmer market weekend
545,Male,DIY home improvement projects allow me to unle...,home improvement project creativity
546,Female,The aroma of a bakery on a Sunday morning is i...,aroma bakery sunday morning
547,Male,Playing chess is a mental workout that I thoro...,playing chess workout
548,Male,Urban photography allows me to capture the ess...,photography capture essence city life
549,Female,I love the feeling of grass beneath my feet in...,love feeling grass foot park


## Count Vectorizer

In [82]:
# Bag-of-words vectorizer: vocabulary capped at max_features terms,
# with scikit-learn's built-in English stopword list applied
max_features = 5000
count_vectorizer = CountVectorizer(
    max_features=max_features,
    stop_words="english",
)

In [83]:
# Learn the vocabulary from the cleaned dialogues and count term occurrences.
# .toarray() converts the result to a dense array, so despite its name
# `sparse_matrix` is no longer sparse once assigned.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split further down, so the vocabulary also reflects the held-out dialogues.
sparse_matrix = count_vectorizer.fit_transform(dialogue_list).toarray()

## Model Training and Testing

In [84]:
# defining X and y
X = sparse_matrix
# Pick the label column by name rather than by position, so the target cannot
# silently change if the CSV gains, loses or reorders columns.
y = df["Gender"].values

In [86]:
# Hold out 10% of the rows for evaluation; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [88]:
# Gaussian Naive Bayes classifier with default settings
# NOTE(review): the features are word counts - confirm GaussianNB is the intended variant.
model = GaussianNB()

In [90]:
# Fit the classifier on the training portion only
model.fit(X_train, y_train)

In [94]:
# model testing
y_predict = model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# score is unchanged, but the documented order keeps this cell correct if an
# order-sensitive metric (precision, recall, ...) is swapped in.
print(accuracy_score(y_test, y_predict))

0.8727272727272727


## Model Saving

In [97]:
# Persist the fitted vectorizer together with the Gaussian Naive-Bayes model:
# the model was trained on the vectorizer's feature columns, so both files are
# needed to classify new text. Paths are relative to the working directory.
joblib.dump(count_vectorizer, 'gender_cv.joblib')
joblib.dump(model, 'gender_model.joblib')

['gender_model.joblib']