In [3]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer
import re
from tqdm import tqdm

# Initialize tqdm for progress tracking
tqdm.pandas()

# Download required NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('vader_lexicon')

# Load the CSV file
file_path = "cleaned_data.csv"
df = pd.read_csv(file_path)

# Extract text from the `Route_Description` column
df['Route_Description'] = df['Route_Description'].fillna('')

# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to clean text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    tokens = word_tokenize(text)  # Tokenize text
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    return ' '.join(tokens)

# Clean the `Route_Description` text
df['cleaned_text'] = df['Route_Description'].apply(clean_text)

# Extract bigrams (two-word phrases)
vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=500)
bigrams = vectorizer.fit_transform(df['cleaned_text'])

# Get the most common bigrams
bigram_counts = pd.DataFrame(bigrams.toarray(), columns=vectorizer.get_feature_names_out())

# Add sentiment scores for each row
df['sentiment_score'] = df['Route_Description'].apply(lambda x: sia.polarity_scores(x)['compound'])

# Updated find_related_bigram function with tqdm
def find_related_bigram(row, bigram_counts, bigram_names):
    row_bigrams = bigram_counts.loc[row.name]
    matching_bigrams = [bigram_names[i] for i in range(len(row_bigrams)) if row_bigrams[i] > 0]
    
    if matching_bigrams:
        # If there are matching bigrams, choose the most common
        return matching_bigrams[0]
    else:
        # If no matching bigrams, choose one with the closest sentiment score
        bigram_sentiments = [
            (bigram, sia.polarity_scores(bigram)['compound'])
            for bigram in bigram_names
        ]
        closest_bigram = min(bigram_sentiments, key=lambda x: abs(x[1] - row['sentiment_score']))
        return closest_bigram[0]

# Apply the function with progress tracking
print("Processing bigrams for each row...")
df['Features'] = df.progress_apply(find_related_bigram, axis=1, bigram_counts=bigram_counts, bigram_names=bigram_names)

# Save the updated dataset
df.to_csv("cleaned_data.csv", index=False)
print("Features column updated with related bigrams and saved back to cleaned_data.csv.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zhang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zhang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\zhang\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


KeyboardInterrupt: 

In [4]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer
import re
from tqdm import tqdm

# Initialize tqdm for progress tracking
tqdm.pandas()

# Download required NLTK resources
print("Downloading NLTK resources...")
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('vader_lexicon', quiet=True)
print("NLTK resources downloaded.")

# Load the CSV file
print("Loading dataset...")
file_path = "cleaned_data.csv"
df = pd.read_csv(file_path)
print(f"Dataset loaded with {len(df)} rows.")

# Extract text from the `Route_Description` column
print("Filling missing Route_Description values...")
df['Route_Description'] = df['Route_Description'].fillna('')
print("Missing values filled.")

# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to clean text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    tokens = word_tokenize(text)  # Tokenize text
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    return ' '.join(tokens)

# Clean the `Route_Description` text with progress bar
print("Cleaning Route_Description text...")
df['cleaned_text'] = df['Route_Description'].progress_apply(clean_text)
print("Text cleaning completed.")

# Extract bigrams (two-word phrases)
print("Extracting bigrams...")
vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=500)
bigrams = vectorizer.fit_transform(tqdm(df['cleaned_text']))
bigram_counts = pd.DataFrame(bigrams.toarray(), columns=vectorizer.get_feature_names_out())
print("Bigrams extracted.")

# Add sentiment scores for each row with progress bar
print("Calculating sentiment scores...")
df['sentiment_score'] = df['Route_Description'].progress_apply(lambda x: sia.polarity_scores(x)['compound'])
print("Sentiment scores calculated.")

# Updated find_related_bigram function
def find_related_bigram(row, bigram_counts, bigram_names):
    row_bigrams = bigram_counts.loc[row.name]
    matching_bigrams = [bigram_names[i] for i in range(len(row_bigrams)) if row_bigrams[i] > 0]
    
    if matching_bigrams:
        # If there are matching bigrams, choose the most common
        return matching_bigrams[0]
    else:
        # If no matching bigrams, choose one with the closest sentiment score
        bigram_sentiments = [
            (bigram, sia.polarity_scores(bigram)['compound'])
            for bigram in bigram_names
        ]
        closest_bigram = min(bigram_sentiments, key=lambda x: abs(x[1] - row['sentiment_score']))
        return closest_bigram[0]

# Apply the function with progress tracking
print("Assigning related bigrams to Features column...")
bigram_names = bigram_counts.columns
df['Features'] = df.progress_apply(find_related_bigram, axis=1, bigram_counts=bigram_counts, bigram_names=bigram_names)
print("Features column updated.")

# Save the updated dataset
print("Saving updated dataset...")
df.to_csv("cleaned_data.csv", index=False)
print("Updated dataset saved to cleaned_data.csv.")


Downloading NLTK resources...
NLTK resources downloaded.
Loading dataset...
Dataset loaded with 97090 rows.
Filling missing Route_Description values...
Missing values filled.
Cleaning Route_Description text...


  2%|▏         | 1463/97090 [00:17<19:07, 83.32it/s] 


KeyboardInterrupt: 

In [5]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer
import re
from tqdm import tqdm

# Initialize tqdm for progress tracking
tqdm.pandas()

# Download required NLTK resources
print("Downloading NLTK resources...")
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('vader_lexicon', quiet=True)
print("NLTK resources downloaded.")

# Step 1: Clean the `Route_Description` text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    tokens = word_tokenize(text)  # Tokenize text
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    return ' '.join(tokens)

print("Loading dataset...")
df = pd.read_csv("cleaned_data.csv")
print(f"Dataset loaded with {len(df)} rows.")

print("Cleaning Route_Description text...")
df['Route_Description'] = df['Route_Description'].fillna('')
df['cleaned_text'] = df['Route_Description'].progress_apply(clean_text)
df.to_csv("step1_cleaned_text.csv", index=False)
print("Step 1 completed: Cleaned text saved to step1_cleaned_text.csv.")

Downloading NLTK resources...
NLTK resources downloaded.
Loading dataset...
Dataset loaded with 97090 rows.
Cleaning Route_Description text...


100%|██████████| 97090/97090 [38:36<00:00, 41.92it/s]  


Step 1 completed: Cleaned text saved to step1_cleaned_text.csv.


In [7]:
print("Loading dataset with cleaned text...")
df = pd.read_csv("step1_cleaned_text.csv")

print("Ensuring cleaned_text column has no NaN values...")
# Replace any remaining NaN values with empty strings
df['cleaned_text'] = df['cleaned_text'].fillna('')

# Ensure all entries are strings
df['cleaned_text'] = df['cleaned_text'].astype(str)

print("Extracting bigrams...")
vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=500)

# Fit-transform the cleaned text
bigrams = vectorizer.fit_transform(tqdm(df['cleaned_text']))

# Save the bigram counts to a DataFrame
bigram_counts = pd.DataFrame(bigrams.toarray(), columns=vectorizer.get_feature_names_out())
bigram_counts.to_csv("step2_bigrams.csv", index=False)
df.to_csv("step2_dataset_with_bigrams.csv", index=False)

print("Step 2 completed: Bigrams saved to step2_bigrams.csv and dataset to step2_dataset_with_bigrams.csv.")

Loading dataset with cleaned text...
Ensuring cleaned_text column has no NaN values...
Extracting bigrams...


100%|██████████| 97090/97090 [00:03<00:00, 26047.95it/s]


Step 2 completed: Bigrams saved to step2_bigrams.csv and dataset to step2_dataset_with_bigrams.csv.


In [9]:
print("Loading dataset with bigrams...")
df = pd.read_csv("step2_dataset_with_bigrams.csv")
sia = SentimentIntensityAnalyzer()

# Ensure Route_Description column has no NaN values or invalid types
print("Ensuring Route_Description column is clean...")
df['Route_Description'] = df['Route_Description'].fillna("").astype(str)

# Calculate sentiment scores
print("Calculating sentiment scores...")
df['sentiment_score'] = df['Route_Description'].progress_apply(lambda x: sia.polarity_scores(x)['compound'])

# Save results
df.to_csv("step3_sentiment_scores.csv", index=False)
print("Step 3 completed: Sentiment scores saved to step3_sentiment_scores.csv.")


Loading dataset with bigrams...
Ensuring Route_Description column is clean...
Calculating sentiment scores...


100%|██████████| 97090/97090 [00:38<00:00, 2522.26it/s]


Step 3 completed: Sentiment scores saved to step3_sentiment_scores.csv.


In [17]:
# Step 4: Apply the function to find the most related bigram
def find_related_bigram(row, bigram_counts, bigram_names):
    row_bigrams = bigram_counts.loc[row.name]
    matching_bigrams = [bigram_names[i] for i in range(len(row_bigrams)) if row_bigrams[i] > 0]
    
    if matching_bigrams:
        return matching_bigrams[0]
    else:
        bigram_sentiments = [
            (bigram, sia.polarity_scores(bigram)['compound'])
            for bigram in bigram_names
        ]
        closest_bigram = min(bigram_sentiments, key=lambda x: abs(x[1] - row['sentiment_score']))
        return closest_bigram[0]

print("Loading dataset with sentiment scores...")
df = pd.read_csv("step3_sentiment_scores.csv")
bigram_counts = pd.read_csv("step2_bigrams.csv")
bigram_names = bigram_counts.columns

# Drop the "work way" column from bigram_counts
if "work way" in bigram_counts.columns:
    print("Dropping 'work way' column from bigram_counts...")
    bigram_counts = bigram_counts.drop(columns=["work way"])

# Drop the "along way" column from bigram_counts
if "along way" in bigram_counts.columns:
    print("Dropping 'along way' column from bigram_counts...")
    bigram_counts = bigram_counts.drop(columns=["along way"])

# Get bigram names and exclude "work way"
bigram_names = [name for name in bigram_counts.columns if name != "work way" and name != "along way"]

print("Assigning related bigrams to Features column...")
df['Features'] = df.progress_apply(find_related_bigram, axis=1, bigram_counts=bigram_counts, bigram_names=bigram_names)
print("Features column updated.")

Loading dataset with sentiment scores...
Dropping 'work way' column from bigram_counts...
Dropping 'along way' column from bigram_counts...
Assigning related bigrams to Features column...


  matching_bigrams = [bigram_names[i] for i in range(len(row_bigrams)) if row_bigrams[i] > 0]
100%|██████████| 97090/97090 [07:22<00:00, 219.64it/s]

Features column updated.





In [19]:
# Step 5: Drop extra columns and save final dataset
df_original = pd.read_csv("cleaned_data.csv")
original_columns = df_original.columns  # Keep track of original columns
print("Dropping extra columns...")
df = df[original_columns]  # Keep only original columns
df.to_csv("cleaned_data_final2.csv", index=False)
print("Step 5 completed: Final dataset saved to cleaned_data_final.csv.")

Dropping extra columns...
Step 5 completed: Final dataset saved to cleaned_data_final.csv.
