<a href="https://colab.research.google.com/github/Alexwcjung/S24Corpus-final/blob/main/Corpus/NLTK_spokenwritten.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLTK analysis example: spoken vs. written comparison

In [None]:
!pip install nltk

# POS tagging with NLTK: finding 'be + p.p.' and 'get + p.p.' passives




In [None]:
import pandas as pd
import requests
from io import StringIO
import nltk
from nltk.tokenize import word_tokenize
import re
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency

# Download the NLTK data needed by word_tokenize / sent_tokenize and pos_tag.
# Recent NLTK releases (3.9+) look for 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# instead of the older packages, and the install step does not pin a version, so both
# generations are requested. A name the installed release does not know is reported by
# nltk.download() as an error message (return value False) instead of an exception.
for resource in (
    'punkt',                           # Tokenizer model (older releases)
    'punkt_tab',                       # Tokenizer model (newer releases)
    'averaged_perceptron_tagger',      # POS tagger model (older releases)
    'averaged_perceptron_tagger_eng',  # POS tagger model (newer releases)
):
    nltk.download(resource)

In [None]:
# Function to read and process data from a URL
def read_and_process_data(url, timeout=30):
    """Download a CSV file and return the alphabetic word tokens of its 'text' column.

    Args:
        url: Address of a CSV file that contains a 'text' column.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list[str]: The tokens produced by ``word_tokenize`` for which
        ``str.isalpha()`` is true, i.e. punctuation, numbers and tokens that mix
        letters with other characters are dropped.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout`` seconds.
        KeyError: If the CSV has no 'text' column.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as CSV.
    response.raise_for_status()
    data = pd.read_csv(StringIO(response.text))
    # Drop missing cells first: astype(str) would turn NaN into the word "nan",
    # which then survives the isalpha() filter as a bogus token.
    combined_text = ' '.join(data['text'].dropna().astype(str))
    tokens = word_tokenize(combined_text)
    return [word for word in tokens if word.isalpha()]  # Remove punctuation

# Raw CSV locations of the two corpora (read_and_process_data reads their 'text' column)
spoken_url = 'https://raw.githubusercontent.com/Alexwcjung/S24Corpus-final/main/Corpus/Cleantext0605.csv'
written_url = 'https://raw.githubusercontent.com/Alexwcjung/S24Corpus-final/main/Corpus/now_final.csv'

# Tokenise the spoken corpus first, then the written one
spoken_tokens, written_tokens = (
    read_and_process_data(corpus_url) for corpus_url in (spoken_url, written_url)
)


In [None]:
from nltk.tokenize import sent_tokenize
from nltk import pos_tag, word_tokenize
def fetch_text_from_url(url, timeout=30):
    """Return the decoded response body of an HTTP GET request to ``url``.

    Args:
        url: Address to download.
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests.get`` can block indefinitely on a stalled server.

    Returns:
        str: The response body as text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text

def count_passives(text):
    """Count 'be + past participle' and 'get + past participle' word pairs in ``text``.

    The text is split into sentences; each sentence is tokenised and POS-tagged,
    and every pair of adjacent tokens is inspected. A form of 'be' or 'get'
    immediately followed by a token tagged ``VBN`` counts as one hit. Only the
    word forms listed below are matched, and only strictly adjacent pairs, so a
    participle separated from its auxiliary by another word is not counted.

    Args:
        text: Running text to analyse.

    Returns:
        tuple[int, int]: ``(be_passives_count, get_passives_count)``.
    """
    be_forms = {'be', 'is', 'am', 'are', 'was', 'were', 'been', 'being'}
    # Every inflected form of 'get', mirroring the full paradigm listed for 'be';
    # matching only the base form misses "got fired", "gets paid", "getting married".
    get_forms = {'get', 'gets', 'got', 'getting', 'gotten'}

    be_passives_count = 0
    get_passives_count = 0

    for sentence in sent_tokenize(text):
        tagged = pos_tag(word_tokenize(sentence))

        # Walk over adjacent (token, next token) pairs of the sentence.
        for (word, _), (_, next_tag) in zip(tagged, tagged[1:]):
            if next_tag != 'VBN':
                continue
            lowered = word.lower()
            if lowered in be_forms:
                be_passives_count += 1
            elif lowered in get_forms:
                get_passives_count += 1

    return be_passives_count, get_passives_count

# URLs to the spoken and written corpus files (CSV files whose 'text' column holds the corpus text)
spoken_text_url = 'https://raw.githubusercontent.com/Alexwcjung/S24Corpus-final/main/Corpus/Cleantext0605.csv'
written_text_url = 'https://raw.githubusercontent.com/Alexwcjung/S24Corpus-final/main/Corpus/now_final.csv'


def extract_text_column(csv_text, column='text'):
    """Return the non-missing cells of ``column`` in a CSV document, joined by spaces.

    Args:
        csv_text: Full CSV document as a string (header row included).
        column: Name of the column that holds the running text.

    Returns:
        str: The cell values joined with single spaces.

    Raises:
        KeyError: If ``column`` is not present in the CSV header.
    """
    frame = pd.read_csv(StringIO(csv_text))
    return ' '.join(frame[column].dropna().astype(str))


# Fetch the files and keep only the corpus text. Counting on the raw file content
# would also tokenise the CSV header row, the CSV quoting and any non-text columns
# the files may have.
spoken_text = extract_text_column(fetch_text_from_url(spoken_text_url))
written_text = extract_text_column(fetch_text_from_url(written_text_url))

# Count passives
spoken_be_passives_count, spoken_get_passives_count = count_passives(spoken_text)
written_be_passives_count, written_get_passives_count = count_passives(written_text)

# Print the results
print("Number of 'be + past participle' tokens in spoken text:", spoken_be_passives_count)
print("Number of 'get + past participle' tokens in spoken text:", spoken_get_passives_count)
print("Number of 'be + past participle' tokens in written text:", written_be_passives_count)
print("Number of 'get + past participle' tokens in written text:", written_get_passives_count)


In [None]:
# Given counts (hard-coded here; presumably copied from a run of the counting step -- TODO confirm)
spoken_be_passives_count, spoken_get_passives_count = 1794, 128
written_be_passives_count, written_get_passives_count = 13910, 124

# Totals per construction, summed over both text types
total_be_passives = spoken_be_passives_count + written_be_passives_count
total_get_passives = spoken_get_passives_count + written_get_passives_count
total_passives = total_be_passives + total_get_passives

# Totals per text type, summed over both constructions
total_spoken_passives = spoken_be_passives_count + spoken_get_passives_count
total_written_passives = written_be_passives_count + written_get_passives_count

# Share of each construction within its text type; a text type without passives reports 0 for both
if total_spoken_passives > 0:
    spoken_be_passives_percentage = (spoken_be_passives_count / total_spoken_passives) * 100
    spoken_get_passives_percentage = (spoken_get_passives_count / total_spoken_passives) * 100
else:
    spoken_be_passives_percentage = 0
    spoken_get_passives_percentage = 0

if total_written_passives > 0:
    written_be_passives_percentage = (written_be_passives_count / total_written_passives) * 100
    written_get_passives_percentage = (written_get_passives_count / total_written_passives) * 100
else:
    written_be_passives_percentage = 0
    written_get_passives_percentage = 0

# Report raw counts, then overall totals, then the per-text-type percentages
report = [
    ("Total occurrences of 'be + past participle' in spoken text:", spoken_be_passives_count),
    ("Total occurrences of 'get + past participle' in spoken text:", spoken_get_passives_count),
    ("Total occurrences of 'be + past participle' in written text:", written_be_passives_count),
    ("Total occurrences of 'get + past participle' in written text:", written_get_passives_count),
    ("\nTotal occurrences of 'be + past participle':", total_be_passives),
    ("Total occurrences of 'get + past participle':", total_get_passives),
    ("Total occurrences of all passives:", total_passives),
    ("\nSpoken text - Percentage of 'be + past participle':", spoken_be_passives_percentage),
    ("Spoken text - Percentage of 'get + past participle':", spoken_get_passives_percentage),
    ("\nWritten text - Percentage of 'be + past participle':", written_be_passives_percentage),
    ("Written text - Percentage of 'get + past participle':", written_get_passives_percentage),
]
for caption, figure in report:
    print(caption, figure)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# One row per (construction, text type): label, hard-coded occurrence count, hard-coded percentage
rows = [
    ('be + past participle in spoken text', 1794, 93.34),
    ('get + past participle in spoken text', 128, 6.66),
    ('be + past participle in written text', 13910, 99.12),
    ('get + past participle in written text', 124, 0.88),
]

# Column-oriented view of the same rows
data = {
    'Category': [category for category, _, _ in rows],
    'Occurrences': [count for _, count, _ in rows],
    'Percentage': [share for _, _, share in rows],
}

# Create DataFrame
df = pd.DataFrame(data)

# Draw the DataFrame as a table on an otherwise empty 12x4 figure
fig, ax = plt.subplots(figsize=(12, 4))
for axis_mode in ('tight', 'off'):
    ax.axis(axis_mode)

# Table content: DataFrame values as cells, column names as the header row
table_options = dict(
    cellText=df.values,
    colLabels=df.columns,
    cellLoc='center',
    loc='center',
)
table = ax.table(**table_options)

# Fixed 10pt font, cells enlarged by 20% in both directions
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1.2, 1.2)

# Write the figure to disk as PNG, then display it
fig.savefig('passive_counts_table.png', bbox_inches='tight', pad_inches=0.1)
plt.show()


In [None]:
import matplotlib.pyplot as plt

def _percentage(part, whole):
    """Return ``part`` as a percentage of ``whole``, or 0 when ``whole`` is not positive."""
    return (part / whole) * 100 if whole > 0 else 0


# Given counts (hard-coded)
spoken_be_passives_count, spoken_get_passives_count = 1794, 128
written_be_passives_count, written_get_passives_count = 13910, 124

# Totals per construction, summed over both text types
total_be_passives = spoken_be_passives_count + written_be_passives_count
total_get_passives = spoken_get_passives_count + written_get_passives_count
total_passives = total_be_passives + total_get_passives

# Totals per text type, summed over both constructions
total_spoken_passives = spoken_be_passives_count + spoken_get_passives_count
total_written_passives = written_be_passives_count + written_get_passives_count

# Share of each construction within its own text type
spoken_be_passives_percentage = _percentage(spoken_be_passives_count, total_spoken_passives)
spoken_get_passives_percentage = _percentage(spoken_get_passives_count, total_spoken_passives)
written_be_passives_percentage = _percentage(written_be_passives_count, total_written_passives)
written_get_passives_percentage = _percentage(written_get_passives_count, total_written_passives)

# Series for the chart: one entry per text type, in the order given by `labels`
labels = ['Spoken', 'Written']
be_passives = [spoken_be_passives_percentage, written_be_passives_percentage]
get_passives = [spoken_get_passives_percentage, written_get_passives_percentage]

# Width of a single bar; the two bars of a text type sit side by side
bar_width = 0.35

# X positions: r1 for the 'be' bars, r2 for the 'get' bars shifted right by one bar width
r1 = range(len(labels))
r2 = [position + bar_width for position in r1]

# Grouped bar chart, one pair of bars per text type
plt.figure(figsize=(10, 6))
for positions, heights, colour, series_name in (
    (r1, be_passives, 'b', 'be + past participle'),
    (r2, get_passives, 'r', 'get + past participle'),
):
    plt.bar(positions, heights, color=colour, width=bar_width, edgecolor='grey', label=series_name)

# Axis labels and title; ticks are centred between the two bars of each group
plt.xlabel('Text Type', fontweight='bold')
plt.ylabel('Percentage', fontweight='bold')
plt.title('Percentage of Passives in Spoken and Written Texts')
tick_positions = [position + bar_width/2 for position in range(len(labels))]
plt.xticks(tick_positions, labels)

# Legend, then render
plt.legend()
plt.show()


In [None]:
import numpy as np
from scipy.stats import chi2_contingency

# Given counts (hard-coded)
spoken_be_passives_count, spoken_get_passives_count = 1794, 128
written_be_passives_count, written_get_passives_count = 13910, 124

# 2x2 contingency table: rows = text type (spoken, written), columns = construction (be, get)
observed = np.array([
    [spoken_be_passives_count, spoken_get_passives_count],
    [written_be_passives_count, written_get_passives_count],
])

# Chi-square test on the contingency table
chi2, p, dof, expected = chi2_contingency(observed)

# Report the test output
for caption, outcome in (
    ("Chi-square statistic:", chi2),
    ("P-value:", p),
    ("Degrees of freedom:", dof),
    ("Expected frequencies:\n", expected),
):
    print(caption, outcome)

# Compare the p-value with the 5% significance level and state the verdict
alpha = 0.05
print(
    "The difference in passive constructions between spoken and written texts is statistically significant (reject null hypothesis)."
    if p < alpha
    else "The difference in passive constructions between spoken and written texts is not statistically significant (fail to reject null hypothesis)."
)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Chi-square test results, hard-coded as (statistic, value) pairs
results = [
    ('Chi-square statistic', 359.143),
    ('P-value', '4.327e-80'),
    ('Degrees of freedom', 1),
    ('Expected frequency (Spoken "be" + past participle)', 1891.645),
    ('Expected frequency (Spoken "get" + past participle)', 30.355),
    ('Expected frequency (Written "be" + past participle)', 13812.355),
    ('Expected frequency (Written "get" + past participle)', 221.645),
    ('Conclusion', 'Statistically significant (reject null hypothesis)'),
]

# Column-oriented view of the same pairs
data = {
    'Statistic': [statistic for statistic, _ in results],
    'Value': [value for _, value in results],
}

# Create DataFrame
df = pd.DataFrame(data)

# Draw the DataFrame as a table on an otherwise empty 12x4 figure
fig, ax = plt.subplots(figsize=(12, 4))
for axis_mode in ('tight', 'off'):
    ax.axis(axis_mode)

# Table content: DataFrame values as cells, column names as the header row
table_options = dict(
    cellText=df.values,
    colLabels=df.columns,
    cellLoc='center',
    loc='center',
)
table = ax.table(**table_options)

# Fixed 10pt font, cells enlarged by 20% in both directions
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1.2, 1.2)

# Write the figure to disk as PNG, then display it
fig.savefig('chi_square_results_table.png', bbox_inches='tight', pad_inches=0.1)
plt.show()



# **The End**