# Reddit dataset analysis

Here's the plan for the upcoming work in the notebook --

- CountVectorizer (vector representation of text) on N-grams (vary the N)
- Train a Naive Bayes Classifier
- Look at most significant features
- Iteratively improve feature selection

In [1]:
import pandas as pd

loaded_df = pd.read_pickle('../../data_samples/reddit_samples/all.pkl')
loaded_df


Unnamed: 0,text,age
0,What happened to my comment....it was soo good...,te
1,"A shit ton of censorship. And I don't mean ""de...",te
2,Wasn't aware of the drama between /r/askmen an...,te
3,Nice username I too am from Finland,te
4,Your comment was on the [other post]( lol,te
...,...,...
64861,"And, after 10 years of marriage, you can get 5...",th
64862,Yes. Thank you for this response. I don’t view...,th
64863,Better hope that you're contacted before someo...,th
64864,Thank you for this question. I also find mysel...,th


In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


def train_classifier(texts, labels):
    # create a CountVectorizer with character-based bigrams
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(2,2), lowercase=False, stop_words=None)
    X = vectorizer.fit_transform(texts)
    # train the classifier and return it
    clf = MultinomialNB()
    clf.fit(X, labels)
    return clf, vectorizer

def train_word_classifier(texts, labels):
    # create a CountVectorizer with words
    vectorizer = CountVectorizer(lowercase=False, stop_words=None)
    X = vectorizer.fit_transform(texts)
    # train the classifier and return it
    clf = MultinomialNB()
    clf.fit(X, labels)
    return clf, vectorizer

def predict_age_group(classifier, vectorizer, new_text):
    # take in a classifier as input and return the prediction
    new_X = vectorizer.transform([new_text])
    predicted_age_group = classifier.predict(new_X)
    return predicted_age_group

def evaluate_classifier(classifier, vectorizer, test_texts, test_labels):
    # transform the test data
    X_test = vectorizer.transform(test_texts)
    # predict the age group and return score
    predicted_age_groups = classifier.predict(X_test)
    return accuracy_score(test_labels, predicted_age_groups)


In [19]:
from sklearn.model_selection import train_test_split

train_texts, test_texts, train_labels, test_labels = train_test_split(list(loaded_df['text']), list(loaded_df['age']), test_size=0.2)

# Train a classifier on the training data
clf, vectorizer = train_classifier(train_texts, train_labels)

# Evaluate the classifier on the test data
accuracy = evaluate_classifier(clf, vectorizer, test_texts, test_labels)

# Print the accuracy score
print("Accuracy:", accuracy)

Accuracy: 0.6040542623708957


In [20]:
N = 10
feature_names = list(vectorizer.vocabulary_.keys())
log_prob = clf.feature_log_prob_
top_N_features = []
for i in range(clf.classes_.shape[0]):
    top_N_indices = log_prob[i].argsort()[::-1][:N]
    top_N_features.extend([feature_names[idx] for idx in top_N_indices])

print("Top {} most significant textual features:".format(N))
print(top_N_features)

Top 10 most significant textual features:
['͑͠', 'er', 'k░', 'wi', 'u\\', '†h', ' 💚', '̎͑', '🥵 ', 've', '͑͠', 'er', 'k░', 'wi', 'u\\', '†h', '🥵 ', '̎͑', ' 💚', '𝕌ς', '͑͠', 'er', 'k░', 'wi', '†h', 'u\\', '🥵 ', '̎͑', ' 💚', 'ₜ ']


In [22]:
# Train a classifier on the training data
word_clf, word_vectorizer = train_word_classifier(train_texts, train_labels)

# Evaluate the classifier on the test data
accuracy = evaluate_classifier(word_clf, word_vectorizer, test_texts, test_labels)

# Print the accuracy score
print("Accuracy:", accuracy)

Accuracy: 0.6983197163557885


In [23]:
N = 10
feature_names = list(word_vectorizer.vocabulary_.keys())
log_prob = word_clf.feature_log_prob_
top_N_features = []
for i in range(clf.classes_.shape[0]):
    top_N_indices = log_prob[i].argsort()[::-1][:N]
    top_N_features.extend([feature_names[idx] for idx in top_N_indices])

print("Top {} most significant textual features:".format(N))
print(top_N_features)

Top 10 most significant textual features:
['raisins', 'fostered', 'unexpectednaruto', 'irreversible', 'Shel', 'itty', 'Hitting', 'KING', 'Newtech', 'goddammit', 'raisins', 'fostered', 'unexpectednaruto', 'Shel', 'Hitting', 'irreversible', 'itty', 'KING', 'goddammit', 'Else', 'raisins', 'fostered', 'unexpectednaruto', 'Shel', 'irreversible', 'Hitting', 'goddammit', 'itty', 'KING', 'Else']


In [5]:
import numpy as np

In [24]:
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def train_ensemble_classifier(texts, labels):
    # train the character-level bigram model
    clf_char, vec_char = train_classifier(texts, labels)

    # train the word-level model
    clf_word, vec_word = train_word_classifier(texts, labels)

    # get the predicted probabilities of the base classifiers
    X_char = vec_char.transform(texts)
    X_word = vec_word.transform(texts)
    proba_char = clf_char.predict_proba(X_char)
    proba_word = clf_word.predict_proba(X_word)

    # horizontally stack the predicted probabilities
    proba_combined = np.hstack([proba_char, proba_word])

    # train the meta-classifier
    meta_clf = LogisticRegression()
    meta_clf.fit(proba_combined, labels)

    return clf_char, clf_word, meta_clf, vec_char, vec_word


def evaluate_ensemble_classifier(clf_char, clf_word, meta_clf, vec_char, vec_word, test_texts, test_labels):
    # transform the test data
    X_char = vec_char.transform(test_texts)
    X_word = vec_word.transform(test_texts)

    # get the predicted probabilities of the base classifiers
    proba_char = clf_char.predict_proba(X_char)
    proba_word = clf_word.predict_proba(X_word)

    # horizontally stack the predicted probabilities
    proba_combined = np.hstack([proba_char, proba_word])

    # predict the labels using the meta-classifier
    predicted_labels = meta_clf.predict(proba_combined)

    return accuracy_score(test_labels, predicted_labels)


In [25]:
# train the ensemble classifier
clf_char, clf_word, meta_clf, vec_char, vec_word = train_ensemble_classifier(train_texts, train_labels)

# preprocess the test data using the vectorizers and evaluate the classifier
accuracy = evaluate_ensemble_classifier(clf_char, clf_word, meta_clf, vec_char, vec_word, test_texts, test_labels)

# print the accuracy score
print('Accuracy: {:.2f}%'.format(accuracy * 100))

Accuracy: 70.33%


In [None]:
N = 10
feature_names = list(word_vectorizer.vocabulary_.keys())
log_prob = word_clf.feature_log_prob_
top_N_features = []
for i in range(clf.classes_.shape[0]):
    top_N_indices = log_prob[i].argsort()[::-1][:N]
    top_N_features.extend([feature_names[idx] for idx in top_N_indices])

print("Top {} most significant textual features:".format(N))
print(top_N_features)

In [26]:
train_texts[0:5]

['That’s horrible for him to say something like that, It’s good that you aren’t friends with him anymore. Also, him saying that he is average weight for Americans isn’t a good thing, average weight for Americans is likely obese, or at best overweight.',
 'Wow, I missed this in my first read through. The way he’s treating OP is awful, but this is actually horrifying and the clearest indicator that this dude is a bad, bad person.',
 "Not op but Canvas is awesome. It connects directly to Google drive, it's straightforward, it's well designed, AND it has a usable app. A+",
 'What the fuck do you guys do if there is a nonbinary',
 'the first comment was great, also when she is honest like says she feels unsafe or having lots of anxiety def take her out of the situation, if she goes out always pick her up when she asks (could get her out of an anxiety driven situation)']

In [42]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

def preprocess_texts(texts):
    preprocessed_texts = []
    
    # create stop words list and instantiate stemmer and lemmatizer
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    for text in texts:
        # lowercase the text
        text = text.lower()
        
        # remove URLs and email addresses
        text = re.sub(r'http\S+|www\S+|https\S+|ftp\S+|@\S+', '', text, flags=re.MULTILINE)
        
        # remove non-alphanumeric characters except for spaces
        text = re.sub(r'[^a-z0-9\s]', '', text)
        
        # remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        
        # tokenize the text
        tokens = word_tokenize(text)
        
        # remove stop words
        tokens = [token for token in tokens if token not in stop_words]
        
        # stem the tokens
        tokens = [stemmer.stem(token) for token in tokens]
        
        # lemmatize the tokens
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        
        # re-join the tokens into a single string
        preprocessed_text = ' '.join(tokens)
        
        preprocessed_texts.append(preprocessed_text)
    
    return preprocessed_texts


In [43]:
pre = preprocess_texts(train_texts)

In [44]:
pre_test = preprocess_texts(test_texts)

In [45]:
# train the ensemble classifier
clf_char, clf_word, meta_clf, vec_char, vec_word = train_ensemble_classifier(pre, train_labels)

# preprocess the test data using the vectorizers and evaluate the classifier
accuracy = evaluate_ensemble_classifier(clf_char, clf_word, meta_clf, vec_char, vec_word, pre_test, test_labels)

# print the accuracy score
print('Accuracy: {:.2f}%'.format(accuracy * 100))

Accuracy: 69.23%


In [46]:
# Train a classifier on the training data
word_clf, word_vectorizer = train_word_classifier(pre, train_labels)

# Evaluate the classifier on the test data
accuracy = evaluate_classifier(word_clf, word_vectorizer, pre_test, test_labels)

# Print the accuracy score
print("Accuracy:", accuracy)

Accuracy: 0.6933867735470942


In [47]:
# Train a classifier on the training data
clf, vectorizer = train_classifier(pre, train_labels)

# Evaluate the classifier on the test data
accuracy = evaluate_classifier(clf, vectorizer, pre_test, test_labels)

# Print the accuracy score
print("Accuracy:", accuracy)

Accuracy: 0.5701402805611222


In [53]:
import spacy
import string
from collections import Counter


nlp = spacy.load("en_core_web_sm")

def analyze_sentence_length(text):
    """
    tokenizes into sentences and returns list of lengths
    """
    doc = nlp(text)
    sentence_lengths = [len(sent) for sent in doc.sents]
    return sentence_lengths


def analyze_words(text):
    """
    tokenizes into words and returns the vocabulary size
    """
    doc = nlp(text)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct and not token.is_space]
    vocabulary_size = len(set(tokens))
    return vocabulary_size


def analyze_syntax(text):
    """
    tokenizes the given text into words
    applies POS tagging
    returns a dictionary containing frequency count of each tag
    """
    doc = nlp(text)
    tag_counts = Counter([token.pos_ for token in doc if not token.is_stop and not token.is_punct and not token.is_space])
    return tag_counts


def analyze_caps(text):
    """
    tokenizes into words and identifies capitalized words
    """
    doc = nlp(text)
    capitalized_words = [token.text for token in doc if token.text.isupper() and not token.is_stop and not token.is_punct and not token.is_space]
    return capitalized_words


def word_freqs(text):
    """
    tokenizes into words and counts the frequency of each word
    """
    doc = nlp(text)
    words = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct and not token.is_space]
    word_frequencies = dict(Counter(words))
    return word_frequencies


def analyze(text):
    """
    returns everything
    """
    return {
        "sentence_lengths": analyze_sentence_length(text),
        "vocabulary_size": analyze_words(text),
        "pos_tag_counts": analyze_syntax(text),
        "capitalized_words": analyze_caps(text),
        "word_frequencies": word_freqs(text)
    }


In [73]:
analyze("hi my name is varun")

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import FunctionTransformer

# Define a function to extract the features from a single document
def analyze_single(text):
    return {
        'sentence_lengths': [len(sent) for sent in nlp(text).sents],
        'vocabulary_size': len(set(nlp(text).text.lower().split())),
        'pos_tag_counts': [token.pos_ for token in nlp(text)],
        'capitalized_words': [token.text for token in nlp(text) if token.is_title],
        'word_frequencies': [token.text for token in nlp(text)]
    }

# Define a function transformer to apply the feature extraction function to each document in the dataset
extract_features = FunctionTransformer(lambda texts: [analyze_single(text) for text in texts])

# Define a pipeline that combines the feature extraction and vectorization steps
pipeline = Pipeline([
    ('extract_features', extract_features),
    ('vectorize', DictVectorizer()),
])

# Apply the pipeline to the training data to extract and vectorize the features
X_train = pipeline.fit_transform(train_texts[0:10000])
y_train = train_labels[0:10000]

# Print the feature matrix
print(X_train.toarray())


TypeError: Unsupported type <class 'int'> in iterable value. Only iterables of string are supported.

In [70]:
"""
A bunch of helper functions for analysis of text, utilized by an effective main.
The plan is to use it for feature engineering / quantitative insights on code.
"""

import string

def count_double_spaces(text):
    """
    number of occurrences of double spaces in the given text (thanks Dr. Han!)
    """
    words = text.split()
    count = sum([1 for w in words if '  ' in w])
    return count

  
def count_punctuation(text):
    """
    number of punctuation marks in the given text.
    """
    return sum([1 for c in text if c in string.punctuation])


def calculate_punct_density(text):
    """
    punctuation density (# of punctuation / len of text)
    """
    total_chars = len(text)
    punct_count = count_punctuation(text)
    return 0 if total_chars == 0 else punct_count / total_chars


def count_exclamation_marks(text):
    """
    number of exclamation marks
    """
    return text.count('!')


def count_question_marks(text):
    """
    number of question marks
    """
    return text.count('?')


def calculate_exclamation_ratio(text):
    """
    ratio of exclamation marks to the total number of punctuation marks
    """
    punct_count = count_punctuation(text)
    exclamation_count = count_exclamation_marks(text)
    return 0 if punct_count == 0 else exclamation_count / punct_count


def calculate_question_ratio(text):
    """
    ratio of question marks to the total number of punctuation marks
    """
    punct_count = count_punctuation(text)
    question_count = count_question_marks(text)
    return 0 if punct_count == 0 else question_count / punct_count


def analyze(text):
    """
    returns everything
    """
    return {
        "punct_count": count_punctuation(text),
        "punct_density": calculate_punct_density(text),
        "exclamation_ratio": calculate_exclamation_ratio(text),
        "question_ratio": calculate_question_ratio(text),
        "double_spaces": count_double_spaces(text),
    }


In [71]:
def extract_features(texts):
    """
    extract features from a list of texts using the analyze function
    """
    features = []
    for text in texts:
        analysis = analyze(text)
        features.append([
            analysis["punct_count"],
            analysis["punct_density"],
            analysis["exclamation_ratio"],
            analysis["question_ratio"],
            analysis["double_spaces"],
        ])
    return np.array(features)

def train_model(X_train, y_train):
    """
    trains a Multinomial Naive Bayes model on the training data
    """
    model = MultinomialNB()
    model.fit(X_train, y_train)
    return model

def evaluate_model(model, X_test, y_test):
    """
    evaluates the accuracy of the model on the testing data
    """
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy


# extract features from the training data
X_train = extract_features(train_texts)
y_train = train_labels

# train the model
model = train_model(X_train, y_train)

# extract features from the testing data
X_test = extract_features(test_texts)
y_test = test_labels

# evaluate the model on the testing data
accuracy = evaluate_model(model, X_test, y_test)

# print the accuracy
print(f"Accuracy: {accuracy}")


Accuracy: 0.45267457992908894


In [72]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.pipeline import FeatureUnion



# create a pipeline that first extracts hand-crafted features,
# then vectorizes the texts using CountVectorizer, and finally applies
# feature selection using SelectKBest with chi2 and scaling using StandardScaler,
# before training a Multinomial Naive Bayes classifier
pipeline = Pipeline([
    ('extract_features', FeatureUnion([
        ('handcrafted', FunctionTransformer(extract_features)),
        ('vectorize', CountVectorizer()),
    ])),
    ('feature_selection', SelectKBest(chi2, k=1000)),
    ('scaling', StandardScaler(with_mean=False)),
    ('classification', MultinomialNB()),
])

# train the model
pipeline.fit(train_texts, train_labels)

# evaluate the model on the testing data
y_pred = pipeline.predict(test_texts)
accuracy = accuracy_score(test_labels, y_pred)

# print the accuracy
print(f"Accuracy: {accuracy}")


Accuracy: 0.6577770926468322
