### Imported the Following Modules

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import stop_words
import nltk
# Fetch every NLTK resource this notebook relies on, so a fresh environment can run top to bottom:
#   wordnet, stopwords                 -> WordNetLemmatizer, stopwords.words('english')
#   vader_lexicon                      -> SentimentIntensityAnalyzer
#   punkt, averaged_perceptron_tagger  -> word_tokenize, pos_tag (noun/adjective extraction)
# NOTE(review): recent NLTK releases may ask for differently named tokenizer/tagger packages — confirm
# against the installed version.
for nltk_resource in ('wordnet', 'stopwords', 'vader_lexicon', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource, quiet=True)

from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
import string
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import accuracy_score, plot_roc_curve, roc_auc_score, recall_score, precision_score, f1_score


# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

pd.options.display.max_colwidth = 200

In [None]:
# Use the fully-qualified option name: the bare "max_rows" pattern can match more than one
# pandas option (e.g. the Styler render option) and then raises OptionError.
pd.set_option("display.max_rows", 100)

In [None]:
# Load the four subreddit scrapes; the CSV files sit next to the notebook.
bro_pill, male_mental_health, mens_lib, ask_men = (
    pd.read_csv(csv_path)
    for csv_path in (
        './bro_pill.csv',
        './malementalhealth.csv',
        './Mens_Lib.csv',
        './ask_men.csv',
    )
)

In [None]:
combined_dfs = [mens_lib, male_mental_health, bro_pill, ask_men]

# Give every frame the same nine column names so they stack cleanly below.
# The assignment is positional, so it assumes each CSV has exactly these nine columns in this order.
for df in combined_dfs:
    df.columns = ['title', 'selftext', 'subreddit', 'created_utc',
                  'author', 'num_comments', 'score', 'is_self', 'timestamp']

# Preview of the stacked frame; the result is displayed here but not stored.
pd.concat(combined_dfs).reset_index(drop=True)

In [None]:
# Stack the four subreddit frames into one frame with a fresh 0..n-1 index.
mens_issues = pd.concat(combined_dfs, ignore_index=True)

In [None]:
# Drop the metadata columns that the text analysis does not use.
# errors='ignore' keeps the cell re-runnable: a second execution no longer raises KeyError
# because the columns are already gone.
mens_issues = mens_issues.drop(
    columns=['created_utc', 'num_comments', 'score', 'is_self', 'timestamp'],
    errors='ignore',
)

In [None]:
mens_issues.shape

#### Removing bot and moderator accounts from the dataset

In [None]:
# Keep only the posts whose author is not one of the four excluded accounts.
mens_issues = mens_issues[
    ~mens_issues['author'].isin(['MLModBot', 'Mr_Holmes', 'Dewey_Darl', 'AutoModerator'])
]

In [None]:
mens_issues.head()

### String Cleaning

Combining title and self text to ensure better text quality of the documents

In [None]:
mens_issues.head()

Replacing `NaN` in `selftext` with a single space so that concatenating the title and the selftext works correctly.

In [None]:
# Assign the result back instead of using inplace=True on a selected column: the in-place form
# is chained assignment, which pandas warns about and which has no effect under copy-on-write.
mens_issues['selftext'] = mens_issues['selftext'].fillna(" ")

In [None]:
#function to remove string, specifically the [removed] string under the selftext column
def remove_string(df, column, string, regex=True):
    """Delete every occurrence of `string` from the text column `df[column]`, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is modified; `df[column]` is overwritten with the cleaned text.
    column : str
        Name of the string column to clean.
    string : str
        Pattern to delete. It is a regular expression by default, so literal
        metacharacters must be escaped (e.g. r'\[removed\]').
    regex : bool, default True
        Passed through to `Series.str.replace`. It is set explicitly because the pandas
        default changed to False in pandas 2.0, which made escaped patterns silently
        stop matching.

    Returns
    -------
    None
    """
    df[column] = df[column].str.replace(string, '', regex=regex)

# Raw string: '\[' in a normal string literal is an invalid escape sequence (a warning today,
# an error in future Python versions). The pattern itself is unchanged.
remove_string(mens_issues, 'selftext', r'\[removed\]')

In [None]:
# Join title and body into one document per post.
# na_rep='' keeps the body when the title is missing; without it str.cat returns NaN for the
# whole row as soon as either side is NaN.
mens_issues['all_text'] = mens_issues['title'].str.cat(mens_issues['selftext'], sep=" ", na_rep='')

Will create a function to normalize and standardize the text within the subreddits.

In [None]:
#Credits to Gwen for function
def clean_strings(input_list, stopwords=()):
    """Normalise a collection of documents for bag-of-words analysis.

    Each document is converted with `str()` and then:
      1. URLs (http/https) are removed wherever they occur in the text;
      2. newlines and tabs become spaces, and every character that is not a letter
         or a space is dropped (numbers, punctuation, curly quotes, ...);
      3. words of 25 characters or more are discarded;
      4. words are lower-cased and stop words are removed (case-insensitively);
      5. words of 3 characters or fewer are discarded.

    Parameters
    ----------
    input_list : iterable
        Documents to clean; non-string items (e.g. NaN) are stringified first.
    stopwords : iterable of str, optional
        Words to remove. Defaults to none.

    Returns
    -------
    list of str
        One cleaned document per input item, words separated by single spaces.
    """
    import re # we'll use regex to strip urls

    # A set gives constant-time membership tests; lower-casing makes the match case-insensitive.
    stopword_set = {word.lower() for word in stopwords}
    output_list = []
    for sentence in input_list:
        # Remove URLs anywhere in the text. The previous '^...' pattern only matched a URL at
        # the start of a line and then deleted the rest of that line along with it.
        sentence = re.sub(r'https?://\S+', ' ', str(sentence))
        sentence = sentence.replace('\n', ' ').replace('\t', ' ') # line breaks / tabs separate words
        sentence = ''.join([letter for letter in sentence if letter.isalpha() or letter == ' ']) #remove numbers and punctuation
        words = [word for word in sentence.split() if len(word) < 25] #exclude egregiously long words
        words = [word.lower() for word in words if word.lower() not in stopword_set] #coerce to lowercase while removing stopwords
        words = [word for word in words if len(word) > 3] #removing words that are 3 letters long or shorter
        output_list.append(' '.join(words)) # single-spaced, so string lengths are not inflated by gaps
    return output_list

In [None]:
#cleaning the combined text and removing every word listed in stop_words.stop_words
mens_issues['all_text'] = clean_strings(mens_issues['all_text'], stopwords = stop_words.stop_words)

In [None]:
#checking for nulls
mens_issues.isna().sum()

#### Lemmatize

In [None]:
#helper that lemmatizes every word of a document
lemmatizer = WordNetLemmatizer()

def lemmatize_words(txt):
    """Lemmatize each whitespace-separated word of the string `txt`.

    Returns the lemmas in their original order, each followed by a single space
    (so a non-empty result ends with a trailing space; empty input gives '').
    """
    return ''.join(lemmatizer.lemmatize(token) + ' ' for token in txt.split())

In [None]:
#saving this version for topic modeling in the future
#mens_issues.to_csv('./men_df_lem.csv', index=False)

#### Stemmatizer

In [None]:
#helper that stems every word of a document
stemmatizer = PorterStemmer()

def stemmatize_words(txt):
    """Apply the Porter stemmer to each whitespace-separated word of the string `txt`.

    Returns the stems in their original order, each followed by a single space
    (so a non-empty result ends with a trailing space; empty input gives '').
    """
    return ''.join(stemmatizer.stem(token) + ' ' for token in txt.split())

In [None]:
# Replace all_text with its stemmed form. The column is overwritten in place, so re-running
# this cell stems the already-stemmed text again.
mens_issues['all_text'] = mens_issues['all_text'].apply(stemmatize_words)

In [None]:
#saving this version for topic modeling in the future
#mens_issues.to_csv('./men_df_stem.csv', index=False)

### Exploratory Data Analysis on Mens' Issues

In [None]:
#investigating the variety of string length
# post_length is the number of characters (not words) in the cleaned, stemmed text
mens_issues['post_length'] = mens_issues['all_text'].str.len()

In [None]:
# Histogram of post lengths (in characters), zoomed to the 0-2,000 range.
g = sns.displot(mens_issues, x="post_length", legend=False, element="step", bins = 500)
plt.xlim(-1,2_000)
# The title now matches what is drawn: one distribution (no per-subreddit split), cut at the
# x-limit set above.
plt.title("Post Length Distribution\nexcluding outliers (>2,000 characters)")
# tight_layout has to run before show(); afterwards there is no figure left to adjust.
plt.tight_layout()
# show() takes no figure argument; passing the FacetGrid positionally was interpreted as an option.
plt.show()

We see that the majority of posts fall within the 0-100 bucket (note that `post_length` counts characters, not words), yet we can't ignore the posts that are considerably longer than that.

In [None]:
# Bag-of-words counts over the cleaned text: English stop words removed, vocabulary capped
# at 20,000 terms.
cvec_eda = CountVectorizer(stop_words = 'english', max_features = 20_000)

X_eda = mens_issues['all_text']
X_eda = cvec_eda.fit_transform(X_eda)
# NOTE(review): .todense() materialises the full documents x terms matrix in memory, and
# get_feature_names() no longer exists in scikit-learn >= 1.2 (get_feature_names_out() replaces
# it) — confirm against the pinned scikit-learn version.
X_eda_df = pd.DataFrame(X_eda.todense(), columns = cvec_eda.get_feature_names())

In [None]:
X_eda_df.shape

In [None]:
X_eda_df.sum().sort_values(ascending = False).head(15)

In [None]:
plt.figure(figsize=(8,4))
X_eda_df.sum().sort_values(ascending=False).head(10).plot(kind='barh');
plt.savefig('eda_words.png', dpi=300)
plt.show()

#### Exploring Most Common Words with Varying String Lengths

In [None]:
#variant of clean_strings that keeps only words longer than 4 letters
def clean_strings2(input_list, stopwords=()):
    """Normalise documents like `clean_strings`, but keep only words of 5+ letters.

    Each document is converted with `str()` and then:
      1. URLs (http/https) are removed wherever they occur in the text;
      2. newlines and tabs become spaces, and every character that is not a letter
         or a space is dropped;
      3. words of 4 characters or fewer are discarded (there is no upper length limit);
      4. words are lower-cased and stop words are removed (case-insensitively).

    Parameters
    ----------
    input_list : iterable
        Documents to clean; non-string items (e.g. NaN) are stringified first.
    stopwords : iterable of str, optional
        Words to remove. Defaults to none.

    Returns
    -------
    list of str
        One cleaned document per input item, words separated by single spaces.
    """
    import re # we'll use regex to strip urls

    # A set gives constant-time membership tests; lower-casing makes the match case-insensitive.
    stopword_set = {word.lower() for word in stopwords}
    output_list = []
    for sentence in input_list:
        # Remove URLs anywhere in the text. The previous '^...' pattern only matched a URL at
        # the start of a line and then deleted the rest of that line along with it.
        sentence = re.sub(r'https?://\S+', ' ', str(sentence))
        sentence = sentence.replace('\n', ' ').replace('\t', ' ') # line breaks / tabs separate words
        sentence = ''.join([letter for letter in sentence if letter.isalpha() or letter == ' ']) #remove numbers and punctuation
        words = [word for word in sentence.split() if len(word) > 4] #keep only words of 5 letters or more
        words = [word.lower() for word in words if word.lower() not in stopword_set] #coerce to lowercase while removing stopwords
        # The former "drop words of up to 3 letters" and quote/newline substitutions could never
        # match after the steps above, so they are omitted.
        output_list.append(' '.join(words))
    return output_list

In [None]:
#all_text2: the cleaned text filtered again by clean_strings2, with the same stop-word list
mens_issues['all_text2'] = clean_strings2(mens_issues['all_text'], stopwords = stop_words.stop_words)

In [None]:
# Same word-count exploration as before, now on all_text2 and with a 10,000-term cap.
cvec_eda2 = CountVectorizer(stop_words = 'english', max_features = 10_000)

# Note: X_eda and X_eda_df are rebound here, replacing the matrices built from all_text above.
X_eda = mens_issues['all_text2']
X_eda = cvec_eda2.fit_transform(X_eda)
X_eda_df = pd.DataFrame(X_eda.todense(), columns = cvec_eda2.get_feature_names())

In [None]:
X_eda_df.sum().sort_values(ascending = False).head(15)

In [None]:
plt.figure(figsize=(8,4))
X_eda_df.sum().sort_values(ascending=False).head(10).plot(kind='barh');
#plt.savefig('eda_words2.png', dpi=300)
plt.show()

#### Sentiment Analysis

In [None]:
#initializing corpus variable to list out the mens_issues['all_text'] column
corpus = list(mens_issues['all_text'])

In [None]:
sia = SentimentIntensityAnalyzer()

In [None]:
#Self-Checking polarity scores for the 10th document
sia.polarity_scores(corpus[9])

In [None]:
# Score every document with the sentiment analyzer and keep the text next to its scores.
sentiment = [{**sia.polarity_scores(doc), 'alltext': doc} for doc in corpus]

df_sentiment = pd.DataFrame(sentiment)
df_sentiment.head(10)

In [None]:
#initializing marker column 
df_sentiment['score'] = 0

In [None]:
#plotting the average positive and negative scores for all posts
# Select the two numeric columns before aggregating: taking the mean of the whole frame also
# tries to average the text column, which raises TypeError in pandas >= 2.0.
df_sentiment.groupby('score')[['pos', 'neg']].mean().plot(kind='barh');
# Only 'pos' and 'neg' are drawn, so the title no longer mentions the compound score.
plt.title('Average Positive & Negative Scores');
plt.ylabel('All Posts');
plt.xlabel('Sentiment Score');

In [None]:
#investigating posts with high negative sentiment scores
df_sentiment[['neg', 'pos', 'alltext']].sort_values('neg', ascending=False).head(10)

In [None]:
#investigating posts with high sentiment scores
df_sentiment[['neg', 'pos', 'alltext']].sort_values('pos', ascending=False).head(10)

By investigating the posts with the highest negative and positive sentiment scores, we see that most of them consist of a single word. I will filter these short posts out to get a clearer idea of what most men are posting on these subreddits.

In [None]:
#boolean flag column: True when the text is at least 50 characters long (despite its name, it does not hold the length)
df_sentiment['alltext_length'] = df_sentiment['alltext'].str.len() >= 50

In [None]:
#Second dataframe holding only the rows flagged as 50+ characters long
second_df_sentiment = df_sentiment[df_sentiment['alltext_length']]

In [None]:
posi_df = second_df_sentiment[['neg', 'pos', 'alltext']].sort_values('pos', ascending=False).head(5000)

In [None]:
cvec_eda_pos = CountVectorizer(stop_words = 'english', max_features = 20_000)

posi_eda = posi_df['alltext']
posi_eda = cvec_eda_pos.fit_transform(posi_eda)
posi_eda_df = pd.DataFrame(posi_eda.todense(), columns = cvec_eda_pos.get_feature_names())

In [None]:
#self-checking values
posi_eda_df.sum().sort_values(ascending = False).head(15)

In [None]:
# Ten most frequent words among the most positive posts.
plt.figure(figsize=(12,7))
posi_eda_df.sum().sort_values(ascending=False).head(10).plot(kind='barh');
# Lay the figure out before saving and showing it; after show() the call had nothing to adjust,
# so the saved PNG never received the tightened layout.
plt.tight_layout();
plt.savefig('posi_eda_words.png', dpi=200)
plt.show();

In [None]:
second_df_sentiment[['neg', 'pos', 'alltext']].sort_values('neg', ascending=False)

In [None]:
neg_df = second_df_sentiment[['neg', 'pos', 'alltext']].sort_values('neg', ascending=False).head(5000)

In [None]:
cvec_eda_neg = CountVectorizer(stop_words = 'english', max_features = 20_000)

neg_eda = neg_df['alltext']
neg_eda = cvec_eda_neg.fit_transform(neg_eda)
neg_eda_df = pd.DataFrame(neg_eda.todense(), columns = cvec_eda_neg.get_feature_names())

In [None]:
neg_eda_df.sum().sort_values(ascending = False).head(15)

In [None]:
# Fifteen most frequent words among the most negative posts.
plt.figure(figsize=(12,7), dpi=100)
neg_eda_df.sum().sort_values(ascending=False).head(15).plot(kind='barh');
# Lay the figure out before saving and showing it; after show() the call had nothing to adjust,
# so the saved PNG never received the tightened layout.
plt.tight_layout();
plt.savefig('neg_eda_words.png', dpi=200)
plt.show();

#### Implementing N-grams

In this section of the EDA, I will explore the top bi-grams and tri-grams within the dataset

In [None]:
from nltk.util import ngrams

In [None]:
# Flatten the tokenised posts into one list of words.
# Previously men_say was immediately overwritten with mens_issues.values.tolist(), so the
# "corpus" contained every cell of every column (titles, authors, numbers, ...) rather than
# the words of all_text.
men_say = mens_issues['all_text'].str.split().tolist()
corpus = [word for tokens in men_say for word in tokens]

In [None]:
def get_top_ngram(corpus, n=None, top_k=10):
    """Return the most frequent n-grams of `corpus` with their total counts.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count n-grams in.
    n : int, optional
        Size of the n-grams (2 = bi-grams, 3 = tri-grams). `None` is treated as 1,
        because the former default of `ngram_range=(None, None)` could not be fitted.
    top_k : int, default 10
        Number of n-grams to return.

    Returns
    -------
    list of (str, count) tuples
        Sorted by descending count, at most `top_k` entries.
    """
    if n is None:
        n = 1
    vec = CountVectorizer(ngram_range=(n, n))
    # fit_transform tokenises the corpus once instead of fitting and transforming separately
    bag_of_words = vec.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx])
                  for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda pair: pair[1], reverse=True)
    return words_freq[:top_k]

In [None]:
# Second pass of clean_strings over all_text, which at this point holds the stemmed text
# produced earlier in the notebook.
mens_issues['all_text'] = clean_strings(mens_issues['all_text'], stopwords = stop_words.stop_words)

In [None]:
import seaborn as sns
import numpy as np
from matplotlib import pyplot
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from collections import  Counter

def plot_top_ngrams_barchart(text, n=2):
    """Draw a horizontal bar chart of the ten most frequent n-grams in `text`.

    Stop words are now actually applied. Previously a stop-word-filtered corpus was built
    from the global `mens_issues` frame and then ignored, so the chart was computed from the
    unfiltered `text`.

    Parameters
    ----------
    text : iterable of str
        Documents to analyse (e.g. a pandas Series of cleaned posts).
    n : int, default 2
        Size of the n-grams to count.

    Returns
    -------
    None
        The chart is drawn on the current seaborn/matplotlib figure.
    """
    extra_stopwords = ['have', 'been', 'this', 'that', 'have', 'have been', 'they were', 'away from', 'they', 'rules still apply', 'even', 'though', 'even though', 'amazon gift card']
    # Single words are removed from the documents; multi-word entries are n-grams to hide.
    banned_words = set(stopwords.words('english')).union(
        entry for entry in extra_stopwords if ' ' not in entry)
    banned_phrases = {entry for entry in extra_stopwords if ' ' in entry}

    # Drop the stop words before the n-grams are formed.
    documents = [' '.join(token for token in str(doc).split() if token not in banned_words)
                 for doc in text]

    def _get_top_ngram(corpus, n=None):
        # Count every n-gram, skip the banned phrases, return the ten most frequent.
        vec = CountVectorizer(ngram_range=(n, n))
        bag_of_words = vec.fit_transform(corpus)
        sum_words = bag_of_words.sum(axis=0)
        words_freq = [(word, sum_words[0, idx])
                      for word, idx in vec.vocabulary_.items()
                      if word not in banned_phrases]
        words_freq = sorted(words_freq, key=lambda pair: pair[1], reverse=True)
        return words_freq[:10]

    top_ngrams = _get_top_ngram(documents, n)
    if not top_ngrams:
        # zip(*[]) below would fail to unpack; there is nothing to draw anyway
        return
    labels, counts = map(list, zip(*top_ngrams))
    sns.set(rc={'figure.figsize':(11.7,8.27)})
    sns.barplot(x=counts, y=labels)

In [None]:
plot_top_ngrams_barchart(mens_issues['all_text'],2)

We can see that the majority of the bi-grams appear to refer to something or someone. What really stood out here is "high school" being a top bi-gram. Furthermore, it appears that sexual assault, social media, and toxic masculinity are among the top bi-grams in these subreddit posts.

In [None]:
plot_top_ngrams_barchart(mens_issues['all_text'],3)

In [None]:
mens_issues.head()

In [None]:
# index=False matches the other exports in this notebook and avoids an extra unnamed index
# column when the file is read back.
mens_issues.to_csv('./men_df.csv', index=False)

In [None]:
mens_issues.isnull().sum()

_____

### Topic Modeling

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Document-term matrix (one row per post, one column per term) with the built-in English stop words.
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(mens_issues.all_text)
# NOTE(review): toarray() builds the full dense matrix in memory — confirm it fits for the full dataset.
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = mens_issues.index
data_dtm.head()

In [None]:
from gensim import matutils, models
import scipy.sparse

In [None]:
# Transposed copy of the posts frame (columns become rows).
# NOTE(review): apart from the row drops in the next cell, tdm is not referenced again in the
# visible notebook — it looks like a leftover.
tdm = mens_issues.transpose()
tdm.head()

In [None]:
tdm.drop(['title', 'selftext'], axis=0, inplace=True)
tdm.drop(['subreddit', 'author'], axis=0, inplace=True)

In [None]:
import pickle

In [None]:
# gensim's Sparse2Corpus treats each *column* as a document by default, while data_dtm has one
# row per document. Transpose to terms x documents first, as the noun-only cells later in the
# notebook already do; without it LDA would model terms as if they were documents.
sparse_counts = scipy.sparse.csr_matrix(data_dtm.transpose())
corpus = matutils.Sparse2Corpus(sparse_counts)

In [None]:
corpus

In [None]:
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer

add_stop_words = ['im', 'just', 'like', "don\'t", 'feel', 'men', 'people', 'time', 'dont', 'make', 'know', 'ive', 'really', 'removed', 'nan', 'got', 'did', 'lot', 'have', 'been']

# Stored under its own name: binding this to `stop_words` would replace the `stop_words` module
# imported at the top of the notebook and break every cell that reads stop_words.stop_words.
custom_stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate document-term matrix
# (CountVectorizer documents a list for stop_words, so the frozenset is converted)
cv = CountVectorizer(stop_words=list(custom_stop_words))
data_cv = cv.fit_transform(mens_issues.all_text)
data_stop = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_stop.index = mens_issues.index

# Pickle it for later use; the with-block closes the file handle deterministically
with open("cv_stop.pkl", "wb") as pkl_file:
    pickle.dump(cv, pkl_file)

In [None]:
# Reload the vectorizer written above (a trusted local file; never unpickle untrusted data).
with open("cv_stop.pkl", "rb") as pkl_file:
    cv = pickle.load(pkl_file)
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

# Rebuild the gensim corpus from the matrix produced by this same vectorizer, so the term ids in
# the corpus line up with id2word. The earlier corpus came from a vectorizer with a different
# stop-word list and therefore different column indices. data_cv has one row per document,
# hence documents_columns=False.
corpus = matutils.Sparse2Corpus(data_cv, documents_columns=False)

In [None]:
cv

In [None]:
# Baseline LDA run: 2 topics, 10 passes over the corpus.
# NOTE(review): no random_state is passed, so the topics may differ from run to run.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10)
lda.print_topics()

In [None]:
### passing the model with lower passes, increased num_topics
lda2 = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=10, passes=5)
lda2.print_topics()

In [None]:
### same number of passes as the previous run (5), num_topics reduced from 10 to 5
lda3 = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=5, passes=5)
lda3.print_topics()

Based on these passes of topic modeling via LDA, it's difficult to interpret the topics that we have here. Will adjust the words that will be passed through the model 

#### Topic Modeling - Nouns

In [None]:
#credits to Alice Zhao
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Tokenize `text`, POS-tag the tokens and return only the nouns
    (tags starting with "NN"), joined by single spaces.'''
    tagged_tokens = pos_tag(word_tokenize(text))
    return ' '.join(token for token, tag in tagged_tokens if tag[:2] == 'NN')

In [None]:
# Keep only the nouns of every post.
# NOTE(review): this overwrites all_text in place, so every later cell that reads all_text
# (including the nouns-and-adjectives step) starts from noun-only text, and re-running the
# cell filters the already-filtered text again.
mens_issues['all_text'] = mens_issues.all_text.apply(nouns)

In [None]:
# Create a new document-term matrix using only nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add the additional stop words since we are recreating the document-term matrix
add_stop_words = ['im', 'just', 'like', "don\'t", 'feel', 'men', 'people', 'time', 'dont', 'make', 'know', 'ive', 'really', 'removed', 'nan', 'got', 'did', 'lot', 'have', 'been', 'askmen', 'reddit']
# Stored under its own name so the `stop_words` module imported at the top of the notebook is
# not replaced by a frozenset.
noun_stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns
# (CountVectorizer documents a list for stop_words, so the frozenset is converted)
cvn = CountVectorizer(stop_words=list(noun_stop_words))
data_cvn = cvn.fit_transform(mens_issues.all_text)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
# The rows correspond to mens_issues.all_text, so take the index from mens_issues; the
# previously used `data_nouns` is not defined anywhere in the visible notebook (NameError).
data_dtmn.index = mens_issues.index
#data_dtmn

In [None]:
# Create the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [None]:
ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10)
ldan.print_topics()

Based on first pass, words like "askmen" and "reddit" shouldn't be a result for topic modeling, will add that to the stop words list.\
Overall, much more cogent words are being produced, but nothing really stands out where I can definitively say that the result being produced is a particular topic. 

In [None]:
#Pass 2, excluding words of askmen and reddit
ldan2 = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10)
ldan2.print_topics()

This pass of the model seems more promising. It appears that the first topic concerns social relationships, whereas the second topic appears to be about the physical features of men (hair, body, masculinity).

In [None]:
#Pass 3, increasing num_topics (2 -> 5) and passes (10 -> 15)
# Note: this rebinds ldan2, so the pass-2 model is no longer available afterwards.
ldan2 = models.LdaModel(corpus=corpusn, num_topics=5, id2word=id2wordn, passes=15)
ldan2.print_topics()

In [None]:
#It appears that the same words are being reflected "years", "year", "women", "woman". Will Lemmatize

In [None]:
#Pass 4, same settings as pass 3 (5 topics, 15 passes)
# NOTE(review): corpusn and id2wordn are not rebuilt between pass 3 and this cell in the
# visible notebook, so this fits the same data with the same settings again.
ldan3 = models.LdaModel(corpus=corpusn, num_topics=5, id2word=id2wordn, passes=15)
ldan3.print_topics()

In [None]:
#It seems like I still gotta few more words that I need to add into the stop words list

In [None]:
# Create the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [None]:
#pass 5
ldan4 = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10)
ldan4.print_topics()

In [None]:
#looks promising, will increase num_topics to 5, 7 and 10 and see if anything substantial shows

In [None]:
#pass 6
ldan5 = models.LdaModel(corpus=corpusn, num_topics=5, id2word=id2wordn, passes=5)
ldan5.print_topics()

In [None]:
#pass 7
ldan6 = models.LdaModel(corpus=corpusn, num_topics=5, id2word=id2wordn, passes=10)
ldan6.print_topics()

In [None]:
#pass 8 (with stemmatizer)
# NOTE(review): the arguments are identical to pass 7 and corpusn is not rebuilt in between
# in the visible notebook — confirm which input change this pass was meant to test.
ldan7 = models.LdaModel(corpus=corpusn, num_topics=5, id2word=id2wordn, passes=10)
ldan7.print_topics()

_____

#### Topic Modeling - Attempt#3 (Nouns and Adjectives)

In [None]:
# Function to pull out nouns and adjectives
def nouns_adj(text):
    '''Tokenize `text`, POS-tag the tokens and return only the nouns and adjectives
    (tags starting with "NN" or "JJ"), joined by single spaces.'''
    wanted_prefixes = ('NN', 'JJ')
    kept_tokens = []
    for token, tag in pos_tag(word_tokenize(text)):
        if tag[:2] in wanted_prefixes:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
# One-column frame (all_text) holding the nouns and adjectives of every post.
# NOTE(review): all_text was overwritten with noun-only text in the "Topic Modeling - Nouns"
# section, so the tagger here no longer sees the original sentences — confirm this is intended.
data_nouns_adj = pd.DataFrame(mens_issues.all_text.apply(nouns_adj))
data_nouns_adj.head()

In [None]:
# Create a new document-term matrix using only nouns and adjectives, also remove common words with max_df


add_stop_words = ['im', 'just', 'like', "don\'t", 'feel', 'men', 'people', 'time', 'dont', 'make', 'know', 'ive', 'really', 'removed', 'nan', 'got', 'did', 'lot', 'have', 'been', 'askmen', 'reddit', 'whats', 'question', 'thing', 'things', 'thats', 'that', 'shes', 'year', 'years', 'youv', 'because', 'talk', 'month', 'post', 'anyone', 'anyon']
# `add_stop_words + stop_words` raised TypeError (a list cannot be added to the frozenset that
# `stop_words` holds at this point). Build the union from scikit-learn's English list directly
# and hand CountVectorizer a plain list.
stop_words_revised = list(text.ENGLISH_STOP_WORDS.union(add_stop_words))

# max_df=.8 additionally ignores terms that occur in more than 80% of the documents
cvna = CountVectorizer(stop_words=stop_words_revised, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.all_text)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
data_dtmna.index = data_nouns_adj.index
data_dtmna.head()

In [None]:
# Create the gensim corpus
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

# Create the vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [None]:
# Let's start with 2 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=10)
ldana.print_topics()

In [None]:
#Pass2- Let's try 5 topics, 20 passes
ldana2 = models.LdaModel(corpus=corpusna, num_topics=5, id2word=id2wordna, passes=20)
ldana2.print_topics()

In [None]:
#Pass 3 - Let's try 5 topics, 30 passes
# Note: this rebinds ldana2, replacing the 20-pass model from the previous cell.
ldana2 = models.LdaModel(corpus=corpusna, num_topics=5, id2word=id2wordna, passes=30)
ldana2.print_topics()

1. Dating Advice
2. ?? 
3. ??

In [None]:
#Pass 3 (continued) - Let's try 5 topics, 40 passes
# Note: the "Pass 3" label is reused from the 30-pass cell above, and ldana2 is rebound once more.
ldana2 = models.LdaModel(corpus=corpusna, num_topics=5, id2word=id2wordna, passes=40)
ldana2.print_topics()

In [None]:
#Pass 4 - Let's try 5 topics, 50 passes
ldana3 = models.LdaModel(corpus=corpusna, num_topics=5, id2word=id2wordna, passes=50)
ldana3.print_topics()

In [None]:
#Topic 1 --> Sexual Assault/Trauma
#Topic 2 --> Stuck in a rut
#Topic 3 --> Body insecurities
#Topic 4 --> Relationship Advice
#Topic 5 --> Gender Roles

In [None]:
#Pass 5 - Let's try 5 topics, 25 passes
ldana4 = models.LdaModel(corpus=corpusna, num_topics=5, id2word=id2wordna, passes=25)
ldana4.print_topics()

In [None]:
#Pass 6 - Let's try 6 topics, 50 passes
ldana5 = models.LdaModel(corpus=corpusna, num_topics=6, id2word=id2wordna, passes=50)
ldana5.print_topics()

In [None]:
#Pass 7 - Let's try 6 topics, 80 passes
ldana7 = models.LdaModel(corpus=corpusna, num_topics=6, id2word=id2wordna, passes=80)
ldana7.print_topics()

In [None]:
# Let's try 7 topics, 20 passes
ldana3 = models.LdaModel(corpus=corpusna, num_topics=7, id2word=id2wordna, passes=20)
ldana3.print_topics()

In [None]:
# Let's try 7 topics, 30 passes
ldana4 = models.LdaModel(corpus=corpusna, num_topics=7, id2word=id2wordna, passes=30)
ldana4.print_topics()

In [None]:
# words like: Years, shes, didnt don't contribute much to my analysis, will be adding them to the stopwords list and attempt again with the model.

In [None]:
#let's try 6 topics
