In [None]:
# Install the necessary libraries
!pip install pandas scikit-learn scipy

import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import ttest_ind, chi2_contingency, norm

import time

# Load the review dataset and keep only the rows whose 'content' is present
file_path = '/content/twitch_reviews.csv'
df = pd.read_csv(file_path)
content_column = df['content']
texts = content_column[content_column.notna()].tolist()

# Preprocessing
def preprocess_text(text):
    """Normalise one review string for bigram extraction.

    The text is lower-cased, every word of one or two characters is removed,
    and each run of whitespace is collapsed to a single space. Leading and
    trailing spaces left behind by removed words are kept.
    """
    without_short_words = re.sub(r'\b\w{1,2}\b', '', text.lower())
    return re.sub(r'\s+', ' ', without_short_words)

texts = list(map(preprocess_text, texts))

# Generate bigrams
def generate_bigrams(texts):
    """Count every word bigram across ``texts``.

    Args:
        texts: iterable of document strings.

    Returns:
        A list of ``(bigram, count)`` pairs, one per distinct bigram, in the
        vectorizer's feature order. ``count`` is the total number of
        occurrences over all documents.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    doc_term = vectorizer.fit_transform(texts)
    bigrams = vectorizer.get_feature_names_out()
    # Sum the columns on the sparse matrix itself. Converting to a dense
    # array first allocates documents x vocabulary cells only to add them up.
    # asarray + ravel flattens the (1, n) result a sparse matrix sum returns.
    counts = np.asarray(doc_term.sum(axis=0)).ravel()
    return list(zip(bigrams, counts))

# Generate true_bigrams.txt file
def generate_true_bigrams_file(bigrams_counts, output_path):
    """Write the bigram of every ``(bigram, count)`` pair to ``output_path``.

    One bigram per line, in input order; the counts are not written. An
    existing file at ``output_path`` is overwritten.
    """
    with open(output_path, 'w') as out:
        out.writelines(f"{bigram}\n" for bigram, _count in bigrams_counts)

# Frequency method
def frequency_method(bigrams, threshold=5):
    """Select the bigrams that occur more than ``threshold`` times.

    Args:
        bigrams: iterable of ``(bigram, count)`` pairs.
        threshold: exclusive lower bound on the count; defaults to 5.

    Returns:
        The bigrams whose count is strictly greater than ``threshold``, in
        input order.
    """
    return [bigram for bigram, count in bigrams if count > threshold]

# Mean-variance method
def mean_variance_method(bigrams):
    """Select the bigrams whose count exceeds mean + variance of all counts.

    ``bigrams`` is a sequence of ``(bigram, count)`` pairs. The variance is
    the population variance (NumPy's default). Returns the selected bigrams
    in input order.
    """
    frequencies = np.array([freq for _, freq in bigrams])
    # NOTE(review): the cutoff adds a variance (squared units) to a mean;
    # confirm this is intended rather than mean + standard deviation.
    cutoff = frequencies.mean() + frequencies.var()

    selected = []
    for phrase, freq in bigrams:
        if freq > cutoff:
            selected.append(phrase)
    return selected

# Z-test method
def z_test_method(bigrams, total_words, null_proportion=0.0001, confidence=0.95):
    """Select bigrams whose relative frequency is significantly above a baseline.

    Runs a one-sided one-proportion z-test per bigram: the observed
    proportion ``count / total_words`` is compared with ``null_proportion``.

    Args:
        bigrams: iterable of ``(bigram, count)`` pairs.
        total_words: number of words in the corpus (the test's sample size).
        null_proportion: proportion assumed under the null hypothesis.
        confidence: one-sided confidence level; the critical value is
            ``norm.ppf(confidence)``.

    Returns:
        The bigrams whose z-score exceeds the critical value, in input order.
        An empty list when ``total_words`` is not positive, because the
        proportions are undefined.
    """
    if total_words <= 0:
        return []

    # Both values are the same for every bigram, so compute them once rather
    # than on each iteration; norm.ppf in particular is a costly call.
    std_error = np.sqrt((null_proportion * (1 - null_proportion)) / total_words)
    critical_z = norm.ppf(confidence)

    return [
        bigram
        for bigram, count in bigrams
        if (count / total_words - null_proportion) / std_error > critical_z
    ]

# T-test method
def t_test_method(bigrams, alpha=0.05):
    """Select bigrams whose count differs significantly from the other counts.

    Each count is compared, as a single observation, against the sample of
    all counts with a two-sided pooled-variance t-test. Welch's unequal
    variance test cannot be used here: a one-element sample has no variance
    estimate, so its p-value is always NaN and no bigram would ever be
    selected. With pooled variance the single observation contributes nothing
    to the variance estimate, which leaves

        t = (mean - x) / (s * sqrt(1 + 1/n)),   df = n - 1

    where ``s`` is the sample standard deviation (ddof=1) of all ``n`` counts.
    All statistics are computed in one vectorised pass.

    Args:
        bigrams: iterable of ``(bigram, count)`` pairs.
        alpha: significance level for the two-sided test; defaults to 0.05.

    Returns:
        The bigrams with p-value below ``alpha``, in input order. An empty
        list when there are fewer than two bigrams or all counts are equal,
        since the statistic is undefined in both cases.
    """
    from scipy.stats import t as student_t

    bigrams = list(bigrams)
    n = len(bigrams)
    if n < 2:
        return []

    counts = np.array([count for _, count in bigrams], dtype=float)
    sample_std = counts.std(ddof=1)
    if sample_std == 0:
        return []

    t_stats = (counts.mean() - counts) / (sample_std * np.sqrt(1 + 1 / n))
    p_values = 2 * student_t.sf(np.abs(t_stats), df=n - 1)
    return [bigram for (bigram, _), p in zip(bigrams, p_values) if p < alpha]

# Chi-square test method
def chi_square_method(bigrams, total_words):
    """Select bigrams whose two words are significantly associated.

    For each bigram "w1 w2" a Pearson chi-square test of independence
    (no continuity correction, 1 degree of freedom, 5% level) is run on the
    2x2 table over all bigram occurrences:

                    second = w2    second != w2
        first = w1      o11            o12
        first != w1     o21            o22

    The marginals are derived from ``bigrams`` itself, so the four cells
    always partition the total number of bigram occurrences. The previous
    table, ``[[c, N - c], [N - c, N]]``, did not partition any population.

    Args:
        bigrams: iterable of ``(bigram, count)`` pairs, where each bigram is
            two whitespace-separated words.
        total_words: accepted for backward compatibility with existing
            callers; the table total is the sum of the bigram counts so that
            every cell stays non-negative.

    Returns:
        The bigrams whose statistic exceeds the 5% critical value, in input
        order. Entries that are not two words, and bigrams whose table has an
        empty row or column (statistic undefined), are skipped.
    """
    from scipy.stats import chi2 as chi2_dist

    first_totals = Counter()
    second_totals = Counter()
    grand_total = 0
    parsed = []
    for bigram, count in bigrams:
        words = bigram.split()
        if len(words) != 2:
            continue
        first, second = words
        # Python ints: the products below can exceed the int64 range.
        count = int(count)
        first_totals[first] += count
        second_totals[second] += count
        grand_total += count
        parsed.append((bigram, first, second, count))

    if grand_total == 0:
        return []

    # p < 0.05 is equivalent to statistic > isf(0.05); compute it once.
    critical = chi2_dist.isf(0.05, 1)

    results = []
    for bigram, first, second, o11 in parsed:
        o12 = first_totals[first] - o11
        o21 = second_totals[second] - o11
        o22 = grand_total - o11 - o12 - o21
        denom = (o11 + o12) * (o11 + o21) * (o12 + o22) * (o21 + o22)
        if denom == 0:
            continue
        # Closed form of the Pearson statistic for a 2x2 table.
        statistic = grand_total * (o11 * o22 - o12 * o21) ** 2 / denom
        if statistic > critical:
            results.append(bigram)
    return results

# Corpus size: whitespace-separated tokens summed over every preprocessed text
total_words = sum(len(text.split()) for text in texts)

# Every distinct bigram paired with its occurrence count
bigrams_counts = generate_bigrams(texts)

# Write the reference bigram list to disk
true_bigrams_path = '/content/true_bigrams.txt'
generate_true_bigrams_file(bigrams_counts, true_bigrams_path)

# Load the reference list back as a set for membership checks.
# NOTE(review): this reference set is written from bigrams_counts, the same
# list every method selects from, so the accuracy computed below is expected
# to be 1.0 for any non-empty prediction -- confirm that is intended.
with open(true_bigrams_path, 'r') as handle:
    true_bigrams = set(handle.read().splitlines())

# Column-oriented result table; one entry is appended per method
results = {
    column: []
    for column in ('Method', 'Bigrams', 'Time (s)', 'Accuracy')
}

# Display name -> extraction function, in evaluation order
methods = {
    'Frequency': frequency_method,
    'Mean-Variance': mean_variance_method,
    'Z-Test': z_test_method,
    'T-Test': t_test_method,
    'Chi-Square': chi_square_method,
}

def calculate_accuracy(predicted_bigrams, true_bigrams):
    """Return the share of predicted bigrams that appear in ``true_bigrams``.

    The numerator counts distinct predicted bigrams found in the reference
    collection; the denominator is ``len(predicted_bigrams)``. Returns 0 when
    nothing was predicted.
    """
    hits = set(predicted_bigrams).intersection(true_bigrams)
    if not predicted_bigrams:
        return 0
    return len(hits) / len(predicted_bigrams)

# Run every method once, recording its selected bigrams, run time and accuracy
for method_name, method in methods.items():
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() follows the wall clock and can jump mid-run.
    start_time = time.perf_counter()
    # These two methods also need the corpus word count.
    if method_name in ('Z-Test', 'Chi-Square'):
        bigrams = method(bigrams_counts, total_words)
    else:
        bigrams = method(bigrams_counts)
    end_time = time.perf_counter()
    elapsed_time = end_time - start_time

    predicted_bigrams = list(bigrams)
    accuracy = calculate_accuracy(predicted_bigrams, true_bigrams)

    results['Method'].append(method_name)
    results['Bigrams'].append(predicted_bigrams)
    results['Time (s)'].append(elapsed_time)
    results['Accuracy'].append(accuracy)

# Convert to a DataFrame and display the result
results_df = pd.DataFrame(results)
results_df.head()




  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero


Unnamed: 0,Method,Bigrams,Time (s),Accuracy
0,Frequency,"[100 100, 100 percent, 100 recomend, 100 recom...",0.065095,1.0
1,Mean-Variance,"[all the, and the, app and, app but, app ever,...",0.215017,1.0
2,Z-Test,"[able watch, about the, about this, account an...",48.810123,1.0
3,T-Test,[],932.013153,0.0
4,Chi-Square,"[000 000, 000 cap, 000 decibels, 000 dollars, ...",103.43158,1.0
