In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
import re
from collections import Counter

# --- NLTK Setup (If you get a LookupError, run these once) ---
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# --- Configuration ---
INPUT_FILE = "raw_data_essay_set1.xlsx"
SCORE_FOR_COMMON_WORDS = 12
ESSAY_COLUMN = 'essay' # Column containing the essay content

# Initialize NLTK components and punctuation translator
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
translator = str.maketrans('', '', string.punctuation)

def preprocess_and_tokenize(text):
    """
    Performs all required preprocessing steps: 
    1. Case handling (selective lowercase, remove @).
    2. Punctuation removal.
    3. Special token grouping (e.g., CAPS1 -> CAPS).
    4. Lemmatization and stopword removal.
    """
    if pd.isna(text) or not text:
        return []
    
    # 1. Apply selective lowercase (keep @-words' original case)
    raw_tokens = str(text).split() 
    cased_tokens = [word if word.startswith('@') else word.lower() for word in raw_tokens]
    
    # 2. Join for Punctuation Removal
    joined_string = ' '.join(cased_tokens)
    clean_string = joined_string.translate(translator)
    clean_tokens = clean_string.split()
    
    processed_tokens = []
    
    for word in clean_tokens:
        
        # 3. Categorization/Grouping Logic (CAPS1 -> CAPS)
        category_match = re.match(r'([A-Z]+)\d*$', word)
        
        if category_match:
            # If it's an uppercase category token (like CAPS1), strip numbers
            lemmatized_word = category_match.group(1)
        else:
            # If it's a regular lowercase word, apply standard lemmatization
            lemmatized_word = lemmatizer.lemmatize(word)
            
        # 4. Stopword Removal
        if lemmatized_word and lemmatized_word not in stop_words:
            processed_tokens.append(lemmatized_word)
            
    return processed_tokens

def find_common_words(df):
    """
    Finds and prints ALL unique processed words from essays with the highest score.
    """
    # 1. Apply preprocessing to the essay column
    df['processed_tokens'] = df[ESSAY_COLUMN].apply(preprocess_and_tokenize)
    
    # 2. Filter for high-score essays
    high_score_tokens = df[df['domain1_score'] == SCORE_FOR_COMMON_WORDS]['processed_tokens']

    # 3. Aggregate all tokens
    all_high_score_tokens = [token for sublist in high_score_tokens for token in sublist]

    # 4. Find ALL unique words (the "common words" for feature creation)
    word_counts = Counter(all_high_score_tokens)
    common_words = list(word_counts.keys())
    
    print("--- Common Words List (Features) ---")
    print(f"Total unique words found in essays with score {SCORE_FOR_COMMON_WORDS}: {len(common_words)}\n")
    print(common_words)

# --- Main Execution Block ---
try:
    df = pd.read_excel(INPUT_FILE)
    print(f"Successfully loaded data from: {INPUT_FILE}\n")

    find_common_words(df)

except FileNotFoundError:
    print(f"\n❌ Error: The input file '{INPUT_FILE}' was not found.")
    print("Please make sure the file is in the correct location.")
except Exception as e:
    print(f"\n❌ An unexpected error occurred: {e}")

Successfully loaded data from: raw_data_essay_set1.xlsx


❌ An unexpected error occurred: ('Lengths must match to compare', (141,), (6,))


In [3]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
import re
from collections import Counter

# --- NLTK Setup (If you get a LookupError, run these once) ---
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# --- Configuration ---
INPUT_FILE = "raw_data_essay_set1.xlsx"
# MODIFICATION: SCORE_FOR_COMMON_WORDS is correctly defined as a list.
SCORE_FOR_COMMON_WORDS = [12, 10, 8, 6, 4, 2] 
ESSAY_COLUMN = 'essay' # Column containing the essay content

# Initialize NLTK components and punctuation translator
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
translator = str.maketrans('', '', string.punctuation)

def preprocess_and_tokenize(text):
    """
    Performs all required preprocessing steps: 
    1. Case handling (selective lowercase, remove @).
    2. Punctuation removal.
    3. Special token grouping (e.g., CAPS1 -> CAPS).
    4. Lemmatization and stopword removal.
    """
    if pd.isna(text) or not text:
        return []
    
    # 1. Apply selective lowercase (keep @-words' original case)
    raw_tokens = str(text).split() 
    # Logic: @-words have @ removed and case preserved; others are lowercased.
    cased_tokens = [word[1:] if word.startswith('@') else word.lower() for word in raw_tokens] 
    
    # 2. Join for Punctuation Removal
    joined_string = ' '.join(cased_tokens)
    clean_string = joined_string.translate(translator)
    clean_tokens = clean_string.split()
    
    processed_tokens = []
    
    for word in clean_tokens:
        
        # 3. Categorization/Grouping Logic (CAPS1 -> CAPS)
        category_match = re.match(r'([A-Z]+)\d*$', word)
        
        if category_match:
            # If it's an uppercase category token, strip numbers
            lemmatized_word = category_match.group(1)
        else:
            # If it's a regular lowercase word, apply standard lemmatization
            lemmatized_word = lemmatizer.lemmatize(word)
            
        # 4. Stopword Removal
        if lemmatized_word and lemmatized_word not in stop_words:
            processed_tokens.append(lemmatized_word)
            
    return processed_tokens

def find_common_words(df, target_scores):
    """
    MODIFIED to iterate over a list of scores and print the unique words for each group.
    """
    # 1. Apply preprocessing once to the essay column
    df['processed_tokens'] = df[ESSAY_COLUMN].apply(preprocess_and_tokenize)
    
    print("\n--- Common Words Analysis by Score Group ---")
    
    for score in target_scores:
        # 2. Filter for essays with the current score
        score_tokens = df[df['domain1_score'] == score]['processed_tokens']

        # 3. Aggregate all tokens for the current score
        all_score_tokens = [token for sublist in score_tokens for token in sublist]

        # 4. Find ALL unique words
        word_counts = Counter(all_score_tokens)
        common_words = list(word_counts.keys())
        
        # 5. Print results
        print(f"\n--- Score Group: {score} ---")
        print(f"Total essays in group: {len(score_tokens)}")
        print(f"Total unique words found: {len(common_words)}")
        # Optionally, print the top 10 most frequent words instead of ALL words
        top_frequent = word_counts.most_common(10) 
        print(f"Top 10 Most Frequent Words: {top_frequent}")
        # If you still want ALL words, uncomment the line below:
        # print(common_words)

# --- Main Execution Block ---
try:
    df = pd.read_excel(INPUT_FILE)
    print(f"Successfully loaded data from: {INPUT_FILE}\n")

    # Pass the list of scores to the modified function
    find_common_words(df, SCORE_FOR_COMMON_WORDS)

except FileNotFoundError:
    print(f"\n❌ Error: The input file '{INPUT_FILE}' was not found.")
    print("Please make sure the file is in the correct location.")
except Exception as e:
    print(f"\n❌ An unexpected error occurred: {e}")

Successfully loaded data from: raw_data_essay_set1.xlsx


--- Common Words Analysis by Score Group ---

--- Score Group: 12 ---
Total essays in group: 12
Total unique words found: 1218
Top 10 Most Frequent Words: [('computer', 170), ('CAPS', 136), ('people', 53), ('society', 38), ('NUM', 33), ('LOCATION', 28), ('ORGANIZATION', 28), ('PERSON', 27), ('time', 27), ('get', 26)]

--- Score Group: 10 ---
Total essays in group: 24
Total unique words found: 1528
Top 10 Most Frequent Words: [('computer', 306), ('people', 190), ('CAPS', 168), ('time', 89), ('friend', 72), ('family', 62), ('dont', 49), ('go', 48), ('like', 47), ('also', 46)]

--- Score Group: 8 ---
Total essays in group: 39
Total unique words found: 1411
Top 10 Most Frequent Words: [('computer', 426), ('people', 201), ('CAPS', 115), ('time', 109), ('get', 93), ('thing', 86), ('family', 83), ('friend', 80), ('go', 73), ('also', 64)]

--- Score Group: 6 ---
Total essays in group: 39
Total unique words found: 937
Top 10 Most Frequen

In [4]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
import re
from collections import Counter

# --- NLTK Setup (If you get a LookupError, run these once) ---
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# --- Configuration ---
INPUT_FILE = "raw_data_essay_set1.xlsx"
ESSAY_COLUMN = 'essay' # Column containing the essay content

# Initialize NLTK components and punctuation translator
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
translator = str.maketrans('', '', string.punctuation)

def preprocess_and_tokenize(text):
    """
    Performs the full text cleaning pipeline (case handling, punctuation removal, 
    special token grouping, lemmatization, and stopword removal).
    """
    if pd.isna(text) or not text:
        return []
    
    # 1. Apply selective lowercase (keep @-words' original case)
    raw_tokens = str(text).split() 
    cased_tokens = [word[1:] if word.startswith('@') else word.lower() for word in raw_tokens] 
    
    # 2. Join for Punctuation Removal
    joined_string = ' '.join(cased_tokens)
    clean_string = joined_string.translate(translator)
    clean_tokens = clean_string.split()
    
    processed_tokens = []
    
    for word in clean_tokens:
        
        # 3. Categorization/Grouping Logic (CAPS1 -> CAPS)
        category_match = re.match(r'([A-Z]+)\d*$', word)
        
        if category_match:
            # If it's an uppercase category token, strip numbers
            lemmatized_word = category_match.group(1)
        else:
            # If it's a regular lowercase word, apply standard lemmatization
            lemmatized_word = lemmatizer.lemmatize(word)
            
        # 4. Stopword Removal
        if lemmatized_word and lemmatized_word not in stop_words:
            processed_tokens.append(lemmatized_word)
            
    return processed_tokens

def find_common_words(df):
    """
    Finds and prints ALL unique processed words (the entire vocabulary) 
    from the whole corpus.
    """
    # 1. Apply preprocessing once to the essay column
    df['processed_tokens'] = df[ESSAY_COLUMN].apply(preprocess_and_tokenize)
    
    # --- MODIFICATION: Aggregate ALL tokens from the entire DataFrame ---
    all_corpus_tokens = df['processed_tokens']

    # 2. Aggregate all tokens into a single list
    all_tokens_list = [token for sublist in all_corpus_tokens for token in sublist]

    # 3. Find ALL unique words (the entire vocabulary)
    word_counts = Counter(all_tokens_list)
    common_words = list(word_counts.keys())
    
    print("\n--- Common Words List (Entire Corpus Vocabulary) ---")
    print(f"Total essays processed: {len(df)}")
    print(f"Total unique words found in the entire corpus: {len(common_words)}\n")
    
    # Print the top 10 most frequent words for context
    top_frequent = word_counts.most_common(10)
    print(f"Top 10 Most Frequent Words: {top_frequent}")
    
    print("\nFull Vocabulary (Unique Words):")
    # Print the full list of unique words
    print(common_words)
    # ------------------------------------------------------------------

# --- Main Execution Block ---
try:
    df = pd.read_excel(INPUT_FILE)
    print(f"Successfully loaded data from: {INPUT_FILE}\n")

    find_common_words(df)

except FileNotFoundError:
    print(f"\n❌ Error: The input file '{INPUT_FILE}' was not found.")
    print("Please make sure the file is in the correct location.")
except Exception as e:
    print(f"\n❌ An unexpected error occurred: {e}")

Successfully loaded data from: raw_data_essay_set1.xlsx


--- Common Words List (Entire Corpus Vocabulary) ---
Total essays processed: 141
Total unique words found in the entire corpus: 3146

Top 10 Most Frequent Words: [('computer', 1286), ('people', 661), ('CAPS', 551), ('time', 284), ('get', 222), ('friend', 213), ('like', 206), ('family', 203), ('thing', 199), ('go', 185)]

Full Vocabulary (Unique Words):
['dear', 'CAPS', 'LOCATION', 'world', 'network', 'people', 'connect', 'learn', 'work', 'efficently', 'wonderful', 'computer', 'benefit', 'society', 'help', 'faraway', 'place', 'develop', 'handeye', 'coordination', 'get', 'done', 'faster', 'day', 'isolated', 'immediate', 'area', 'stuck', 'town', 'unaware', 'rest', 'unable', 'enjoy', 'beauty', 'send', 'starving', 'PERCENT', 'money', 'donated', 'ORGANIZATION', 'organization', 'assist', 'global', 'disaster', 'come', 'online', 'without', 'would', 'willing', 'even', 'know', 'give', 'community', 'ability', 'reach', 'farthest', 'part', 'g