In [1]:
import pandas as pd
import numpy as np

def load_and_preprocess(file_path):
    try:
        # Step 1: Load the dataset
        df = pd.read_csv(file_path)
        print("File successfully loaded.")

        # Step 2: Basic dataset information
        print("\n--- Dataset Overview ---")
        print(f"Shape: {df.shape}")
        print("Columns:", df.columns.tolist())
        print("\nData Types:\n", df.dtypes)
        print("\nMissing Values:\n", df.isnull().sum())

        # Step 3: Standardize column names
        df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]
        print("\nStandardized Column Names:", df.columns.tolist())

        # Step 4: Handle missing values
        print("\n--- Handling Missing Values ---")
        missing_threshold = 0.3  # Drop columns with >30% missing data
        missing_cols = df.columns[df.isnull().mean() > missing_threshold]
        if len(missing_cols) > 0:
            print(f"Dropping columns with >{missing_threshold*100}% missing values: {missing_cols.tolist()}")
            df.drop(columns=missing_cols, inplace=True)
        else:
            print("No columns dropped for missing values.")

        # Fill remaining missing values
        df.fillna(method='ffill', inplace=True)  # Forward fill as default
        df.fillna(method='bfill', inplace=True)  # Backward fill for remaining
        print("Remaining missing values filled.")

        # Step 5: Remove duplicates
        print("\n--- Checking for Duplicates ---")
        duplicates = df.duplicated().sum()
        if duplicates > 0:
            print(f"Removing {duplicates} duplicate rows.")
            df = df.drop_duplicates()
        else:
            print("No duplicates found.")

        # Step 6: Detect and handle outliers (example for numerical columns)
        print("\n--- Handling Outliers ---")
        num_cols = df.select_dtypes(include=np.number).columns
        for col in num_cols:
            q1, q3 = df[col].quantile([0.25, 0.75])
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            outliers = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
            if outliers > 0:
                print(f"Column '{col}' has {outliers} outliers. Clipping values.")
                df[col] = np.clip(df[col], lower_bound, upper_bound)

        # Step 7: Provide final dataset overview
        print("\n--- Final Dataset Overview ---")
        print(f"Shape after preprocessing: {df.shape}")
        print("Sample Data:\n", df.head())

        # Step 8: Save cleaned data
        clean_file_path = file_path.replace('.csv', '_cleaned.csv')
        df.to_csv(clean_file_path, index=False)
        print(f"\nCleaned dataset saved to {clean_file_path}.")
        return df

    except FileNotFoundError:
        print("Error: File not found. Please ensure the file path is correct.")
    except pd.errors.EmptyDataError:
        print("Error: File is empty.")
    except pd.errors.ParserError:
        print("Error: Could not parse the file. Check file format.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# Run the function with your file path
df_cleaned = load_and_preprocess('clothing_sampled.csv')


File successfully loaded.

--- Dataset Overview ---
Shape: (211876, 5)
Columns: ['reviewer_id', 'purchase_history', 'last_purchase', 'user', 'metadata']

Data Types:
 reviewer_id         object
purchase_history    object
last_purchase       object
user                object
metadata            object
dtype: object

Missing Values:
 reviewer_id         0
purchase_history    0
last_purchase       0
user                0
metadata            0
dtype: int64

Standardized Column Names: ['reviewer_id', 'purchase_history', 'last_purchase', 'user', 'metadata']

--- Handling Missing Values ---
No columns dropped for missing values.


  df.fillna(method='ffill', inplace=True)  # Forward fill as default
  df.fillna(method='bfill', inplace=True)  # Backward fill for remaining


Remaining missing values filled.

--- Checking for Duplicates ---
Removing 1519 duplicate rows.

--- Handling Outliers ---

--- Final Dataset Overview ---
Shape after preprocessing: (210357, 5)
Sample Data:
       reviewer_id                                   purchase_history  \
0  A1L3BAYQ7DZWZS  [{'item': {'asin': 'B014DUB312', 'title': "Fau...   
1  A2DXOK3HLBPZYG  [{'item': {'asin': 'B00NWA56L8', 'title': 'Har...   
2  A2FFXP452K3J8E  [{'item': {'asin': 'B00B9FKH94', 'title': "Emp...   
3  A1PPLJ50ZJN58T  [{'item': {'asin': 'B001V9LM4M', 'title': "Ame...   
4   AQ4LZBULIOAQI  [{'item': {'asin': 'B0001YR5AI', 'title': 'Dic...   

                                       last_purchase  \
0  {'reason': 'The customer likely needed replace...   
1  {'reason': 'The customer likely needed a new s...   
2  {'reason': 'Needed comfortable sneakers for ca...   
3  {'reason': 'The customer wanted a comfortable,...   
4  {'reason': 'The customer likely purchased a be...   

                      

In [2]:
import pandas as pd

# Save cleaned data for Shiny app
df_cleaned.to_csv('./clothing_sampled_cleaned.csv', index=False)
# Check columns in the dataset
print(df.columns)

# Select the column containing textual data for sentiment analysis
text_column = 'review_text'  # Replace with the actual column name in your dataset

NameError: name 'df' is not defined

In [None]:
import pandas as pd
import ast  # For safely evaluating stringified dictionaries
from textblob import TextBlob

# Sample DataFrame
data = {
    "reviewer_id": [
        "A1L3BAYQ7DZWZS", "A2DXOK3HLBPZYG", "A2FFXP452K3J8E", "A1PPLJ50ZJN58T", "AQ4LZBULIOAQI"
    ],
    "last_purchase": [
        "{'reason': 'The customer likely needed replacement parts for their device.'}",
        "{'reason': 'The customer likely needed a new style for summer.'}",
        "{'reason': 'Needed comfortable sneakers for casual wear.'}",
        "{'reason': 'The customer wanted a comfortable, stylish shoe for walking.'}",
        "{'reason': 'The customer likely purchased a beach hat for vacation.'}"
    ]
}

df = pd.DataFrame(data)

# Extract the 'reason' field
def extract_reason(purchase):
    try:
        purchase_dict = ast.literal_eval(purchase)  # Convert string to dictionary
        return purchase_dict.get('reason', None)
    except:
        return None

df['reason_text'] = df['last_purchase'].apply(extract_reason)

# Display extracted reasons
print(df[['reviewer_id', 'reason_text']])

In [22]:
# Define positive and negative keywords
positive_keywords = ["comfortable", "stylish", "new", "great", "perfect", "love", "excellent", "vacation", "enjoy"]
negative_keywords = ["needed", "replacement", "issue", "problem", "broken", "poor", "dissatisfied", "complaint"]

# Define a function to calculate keyword-based sentiment
def keyword_sentiment(text):
    if pd.isnull(text):  # Handle missing values
        return None
    text = text.lower()  # Convert text to lowercase for comparison
    positive_count = sum(word in text for word in positive_keywords)
    negative_count = sum(word in text for word in negative_keywords)
    score = positive_count - negative_count  # Positive if more positive keywords, negative if more negative keywords
    return score

In [23]:
# Combine keyword-based and TextBlob sentiment
def hybrid_sentiment(text):
    if pd.isnull(text):  # Handle missing values
        return None
    # Keyword-based sentiment score
    keyword_score = keyword_sentiment(text)
    # TextBlob sentiment score
    blob_score = TextBlob(text).sentiment.polarity
    # Combine scores (you can tune weights if needed)
    combined_score = blob_score + (0.5 * keyword_score)  # Give more weight to TextBlob or keywords as needed
    return combined_score

In [None]:
# Apply the hybrid sentiment scoring function
df['hybrid_sentiment_score'] = df['reason_text'].apply(hybrid_sentiment)

# Classify sentiment based on the hybrid score
def classify_hybrid_sentiment(score):
    if score is None:
        return None
    elif score > 0:
        return 'positive'
    elif score < 0:
        return 'negative'
    else:
        return 'neutral'

df['hybrid_sentiment_label'] = df['hybrid_sentiment_score'].apply(classify_hybrid_sentiment)

# Display results
print(df[['reviewer_id', 'reason_text', 'hybrid_sentiment_score', 'hybrid_sentiment_label']])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Countplot for sentiment labels
sns.countplot(data=df, x='hybrid_sentiment_label', palette='coolwarm')
plt.title("Distribution of Hybrid Sentiment Labels")
plt.show()

# Sentiment score distribution
sns.histplot(df['hybrid_sentiment_score'].dropna(), kde=True, color='purple')
plt.title("Hybrid Sentiment Score Distribution")
plt.xlabel("Hybrid Sentiment Score")
plt.ylabel("Frequency")
plt.show()

# Save results to a CSV file
df.to_csv('./clothing_sampled_with_hybrid_sentiment.csv', index=False)
print("Hybrid sentiment analysis results saved to clothing_sampled_with_hybrid_sentiment.csv")

In [None]:
# Define a function to calculate sentiment polarity
def get_sentiment(text):
    if pd.isnull(text):  # Handle missing values
        return None
    analysis = TextBlob(text)
    return analysis.sentiment.polarity

# Apply sentiment analysis
df['sentiment_score'] = df['reason_text'].apply(get_sentiment)

# Classify sentiment
def classify_sentiment(score):
    if score is None:
        return None
    elif score > 0:
        return 'positive'
    elif score < 0:
        return 'negative'
    else:
        return 'neutral'

df['sentiment_label'] = df['sentiment_score'].apply(classify_sentiment)

# Display results
print(df[['reviewer_id', 'reason_text', 'sentiment_score', 'sentiment_label']])

In [None]:
df.to_csv('/content/clothing_sampled_with_sentiment.csv', index=False)
print("Sentiment analysis results saved to clothing_sampled_with_sentiment.csv")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Countplot for sentiment labels
sns.countplot(data=df, x='sentiment_label', palette='coolwarm')
plt.title("Distribution of Sentiment Labels")
plt.show()

# Sentiment score distribution
sns.histplot(df['sentiment_score'].dropna(), kde=True, color='blue')
plt.title("Sentiment Score Distribution")
plt.xlabel("Sentiment Score")
plt.ylabel("Frequency")
plt.show()

In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from textblob import TextBlob
import ast

# Step 1: Load the cleaned dataset
file_path = './clothing_sampled_cleaned.csv'  # Adjust the path if necessary
df = pd.read_csv(file_path)

# Step 2: Extract 'reason_text' from the 'last_purchase' column
def extract_reason(purchase):
    try:
        purchase_dict = ast.literal_eval(purchase)  # Convert string to dictionary
        return purchase_dict.get('reason', None)
    except:
        return None

df['reason_text'] = df['last_purchase'].apply(extract_reason)

# Step 3: Generate sentiment labels using TextBlob
def get_sentiment(text):
    if pd.isnull(text):  # Handle missing values
        return None
    analysis = TextBlob(text)
    polarity = analysis.sentiment.polarity
    if polarity > 0:
        return 'positive'
    elif polarity < 0:
        return 'negative'
    else:
        return 'neutral'

df['sentiment_label'] = df['reason_text'].apply(get_sentiment)

# Check if sentiment labels were generated
print("\nSample Data with Sentiment Labels:")
print(df[['reason_text', 'sentiment_label']].head())

# Step 4: Text vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=500)
X_text = vectorizer.fit_transform(df['reason_text'].fillna('')).toarray()

# Step 5: Prepare data for classification
X = X_text  # Features (add more if needed)
y = df['sentiment_label']  # Target variable

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Train a classification model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Step 7: Make predictions and evaluate the model
y_pred = clf.predict(X_test)

# Print evaluation metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nAccuracy:", accuracy_score(y_test, y_pred))

# Save the updated DataFrame with sentiment labels
output_file = './clothing_sampled_with_sentiment.csv'
df.to_csv(output_file, index=False)
print(f"\nResults saved to {output_file}")


Sample Data with Sentiment Labels:
                                         reason_text sentiment_label
0  The customer likely needed replacement lenses ...         neutral
1  The customer likely needed a new swimsuit for ...        positive
2  Needed comfortable sneakers for casual wear bu...        negative
3  The customer wanted a comfortable, high-qualit...        negative
4  The customer likely purchased a beanie as a gi...         neutral

Classification Report:
              precision    recall  f1-score   support

    negative       0.90      0.79      0.84      6708
     neutral       0.84      0.93      0.88      8914
    positive       0.95      0.94      0.95     26450

    accuracy                           0.92     42072
   macro avg       0.90      0.89      0.89     42072
weighted avg       0.92      0.92      0.92     42072


Accuracy: 0.9164765164479939

Results saved to ./clothing_sampled_with_sentiment.csv


In [8]:
df.head()

Unnamed: 0,reviewer_id,purchase_history,last_purchase,user,metadata,reason_text,sentiment_label,sentiment_numeric
0,A1L3BAYQ7DZWZS,"[{'item': {'asin': 'B014DUB312', 'title': ""Fau...",{'reason': 'The customer likely needed replace...,"{'profile': 'Fashionable, Value-focused Shoppe...","{'example': {'richness': 3, 'confidence': 5}, ...",The customer likely needed replacement lenses ...,neutral,1
1,A2DXOK3HLBPZYG,"[{'item': {'asin': 'B00NWA56L8', 'title': 'Har...",{'reason': 'The customer likely needed a new s...,"{'profile': 'Recreational, Comfortable, Casual...","{'example': {'richness': 5, 'confidence': 7}, ...",The customer likely needed a new swimsuit for ...,positive,2
2,A2FFXP452K3J8E,"[{'item': {'asin': 'B00B9FKH94', 'title': ""Emp...",{'reason': 'Needed comfortable sneakers for ca...,"{'profile': 'Casual, Comfortable, Male, Shoppe...","{'example': {'richness': 3, 'confidence': 6}, ...",Needed comfortable sneakers for casual wear bu...,negative,0
3,A1PPLJ50ZJN58T,"[{'item': {'asin': 'B001V9LM4M', 'title': ""Ame...","{'reason': 'The customer wanted a comfortable,...","{'profile': 'Male, Stylish, Comfortable, Boot ...","{'example': {'richness': 5, 'confidence': 5}, ...","The customer wanted a comfortable, high-qualit...",negative,0
4,AQ4LZBULIOAQI,"[{'item': {'asin': 'B0001YR5AI', 'title': 'Dic...",{'reason': 'The customer likely purchased a be...,"{'profile': 'Practical, functional, warm weath...","{'example': {'richness': 5, 'confidence': 5}, ...",The customer likely purchased a beanie as a gi...,neutral,1


In [6]:
#save file
import joblib

# Save the trained model using joblib
joblib.dump(clf, 'sentiment_classification.joblib')

['sentiment_classification.joblib']

In [5]:
#XGBoost

# Import necessary libraries
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from textblob import TextBlob
import xgboost as xgb

# Step 1: Load the cleaned dataset
file_path = './clothing_sampled_cleaned.csv'  # Adjust the path if necessary
df = pd.read_csv(file_path)

# Step 2: Extract 'reason_text' from the 'last_purchase' column
def extract_reason(purchase):
    try:
        purchase_dict = ast.literal_eval(purchase)  # Convert string to dictionary
        return purchase_dict.get('reason', None)
    except:
        return None

df['reason_text'] = df['last_purchase'].apply(extract_reason)

# Step 3: Generate sentiment labels using TextBlob
def get_sentiment(text):
    if pd.isnull(text):  # Handle missing values
        return None
    analysis = TextBlob(text)
    polarity = analysis.sentiment.polarity
    if polarity > 0:
        return 'positive'
    elif polarity < 0:
        return 'negative'
    else:
        return 'neutral'

df['sentiment_label'] = df['reason_text'].apply(get_sentiment)

# Check if sentiment labels were generated
print("\nSample Data with Sentiment Labels:")
print(df[['reason_text', 'sentiment_label']].head())

# Step 4: Text vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=500)
X_text = vectorizer.fit_transform(df['reason_text'].fillna('')).toarray()

# Encode target labels to numeric format for XGBoost
label_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
df['sentiment_numeric'] = df['sentiment_label'].map(label_mapping)

# Step 5: Prepare data for classification
X = X_text  # Features (add more if needed)
y = df['sentiment_numeric']  # Numeric target variable

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Train an XGBoost classification model
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',  # Multiclass classification
    num_class=3,  # Three classes: positive, neutral, negative
    eval_metric='mlogloss',  # Log loss for multiclass
    use_label_encoder=False,
    random_state=42
)
xgb_model.fit(X_train, y_train)

# Step 7: Make predictions and evaluate the model
y_pred = xgb_model.predict(X_test)

# Convert numeric predictions back to labels for interpretation
reverse_label_mapping = {v: k for k, v in label_mapping.items()}
y_test_labels = y_test.map(reverse_label_mapping)
y_pred_labels = pd.Series(y_pred).map(reverse_label_mapping)

# Print evaluation metrics
print("\nClassification Report:")
print(classification_report(y_test_labels, y_pred_labels))
print("\nAccuracy:", accuracy_score(y_test_labels, y_pred_labels))

# Save the updated DataFrame with sentiment labels
output_file = './clothing_sampled_with_xgboost_sentiment.csv'
df.to_csv(output_file, index=False)
print(f"\nResults saved to {output_file}")


Sample Data with Sentiment Labels:
                                         reason_text sentiment_label
0  The customer likely needed replacement lenses ...         neutral
1  The customer likely needed a new swimsuit for ...        positive
2  Needed comfortable sneakers for casual wear bu...        negative
3  The customer wanted a comfortable, high-qualit...        negative
4  The customer likely purchased a beanie as a gi...         neutral


Parameters: { "use_label_encoder" } are not used.




Classification Report:
              precision    recall  f1-score   support

    negative       0.92      0.84      0.88      6708
     neutral       0.83      0.98      0.90      8914
    positive       0.98      0.94      0.96     26450

    accuracy                           0.93     42072
   macro avg       0.91      0.92      0.91     42072
weighted avg       0.94      0.93      0.93     42072


Accuracy: 0.9326155162578437

Results saved to ./clothing_sampled_with_xgboost_sentiment.csv


In [7]:
import joblib

# Save the trained model using joblib
joblib.dump(xgb_model, 'xgboost_sentiment_classification.joblib')

['xgboost_sentiment_classification.joblib']