In [93]:
#!pip install streamlit

In [94]:

import pandas as pd
import numpy as np
import streamlit as st
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [95]:
# Load the dataset from JSON file (JSON Lines format)
rent_the_runway_final_data = pd.read_json('renttherunway_final_data.json', lines=True)
rent_the_runway_final_data.head()
#rent_the_runway_final_data['review_summary'][rent_the_runway_final_data['review_summary'].notna()].head()

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date
0,fit,420272,34d,2260466,137lbs,10.0,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,"5' 8""",14,28.0,"April 20, 2016"
1,fit,273551,34b,153475,132lbs,10.0,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,"5' 6""",12,36.0,"June 18, 2013"
2,fit,360448,,1063761,,10.0,party,This hugged in all the right places! It was a ...,,It was a great time to celebrate the (almost) ...,sheath,"5' 4""",4,116.0,"December 14, 2015"
3,fit,909926,34c,126335,135lbs,8.0,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,"5' 5""",8,34.0,"February 12, 2014"
4,fit,151944,34b,616682,145lbs,10.0,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,"5' 9""",12,27.0,"September 26, 2016"


In [96]:
rent_the_runway_final_data.columns = (
    rent_the_runway_final_data.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
)
rent_the_runway_final_data.columns

Index(['fit', 'user_id', 'bust_size', 'item_id', 'weight', 'rating',
       'rented_for', 'review_text', 'body_type', 'review_summary', 'category',
       'height', 'size', 'age', 'review_date'],
      dtype='object')

--- 

In [97]:
def parse_weight(w):
    if isinstance(w, str):
        return int(w.replace('lbs', ''))
    return w

def parse_height(h):
    if isinstance(h, str) and "'" in h:
        feet, inches = h.split("' ")
        inches = inches.replace('"', '')
        return (int(feet) * 12) + int(inches)
    return h

In [98]:
# Data cleaning and preparation for rent_the_runway_final_data

# 1. Remove duplicates
rent_the_runway_final_data = rent_the_runway_final_data.drop_duplicates()


# 2. Drop rows with missing essential values
essential_cols = ['item_id', 'review_text', 'rating']
rent_the_runway_final_data = rent_the_runway_final_data.dropna(subset=essential_cols)

# Apply the cleaning
rent_the_runway_final_data['weight_num'] = rent_the_runway_final_data['weight'].apply(parse_weight)
rent_the_runway_final_data['height_num'] = rent_the_runway_final_data['height'].apply(parse_height)

rent_the_runway_final_data['weight_num'] = pd.to_numeric(rent_the_runway_final_data['weight'].apply(parse_weight), errors='coerce')
rent_the_runway_final_data['height_num'] = pd.to_numeric(rent_the_runway_final_data['height'].apply(parse_height), errors='coerce')

# 3. Fill missing values in less critical columns
rent_the_runway_final_data['category'] = rent_the_runway_final_data['category'].fillna('unknown')
rent_the_runway_final_data['bust_size'] = rent_the_runway_final_data['bust_size'].fillna('unknown')
rent_the_runway_final_data['body_type'] = rent_the_runway_final_data['body_type'].fillna('unknown')
rent_the_runway_final_data['height'] = rent_the_runway_final_data['height'].fillna('unknown')
rent_the_runway_final_data['review_summary'] = rent_the_runway_final_data['review_summary'].fillna('unknown')
rent_the_runway_final_data['rented_for'] = rent_the_runway_final_data['rented_for'].fillna('unknown')

# Apply the cleaning
rent_the_runway_final_data['weight_num'] = rent_the_runway_final_data['weight'].apply(parse_weight)
rent_the_runway_final_data['height_num'] = rent_the_runway_final_data['height'].apply(parse_height)

# For numerical columns, fill missing values with median
for col in ['size', 'age']:
    rent_the_runway_final_data[col] = rent_the_runway_final_data[col].fillna(rent_the_runway_final_data[col].median())

# 4. Strip whitespace from string columns
string_cols = ['review_text', 'category', 'bust_size', 'body_type', 'height', 'weight', 'review_summary', 'rented_for']
for col in string_cols:
    rent_the_runway_final_data[col] = rent_the_runway_final_data[col].astype(str).str.strip()

# 5. Reset index after cleaning
rent_the_runway_final_data = rent_the_runway_final_data.reset_index(drop=True)

In [99]:
# Display cleaned dataframe summary
rent_the_runway_final_data.head()

Unnamed: 0,fit,user_id,bust_size,item_id,weight,rating,rented_for,review_text,body_type,review_summary,category,height,size,age,review_date,weight_num,height_num
0,fit,420272,34d,2260466,137lbs,10.0,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,"5' 8""",14,28.0,"April 20, 2016",137.0,68
1,fit,273551,34b,153475,132lbs,10.0,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,"5' 6""",12,36.0,"June 18, 2013",132.0,66
2,fit,360448,unknown,1063761,,10.0,party,This hugged in all the right places! It was a ...,unknown,It was a great time to celebrate the (almost) ...,sheath,"5' 4""",4,116.0,"December 14, 2015",,64
3,fit,909926,34c,126335,135lbs,8.0,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,"5' 5""",8,34.0,"February 12, 2014",135.0,65
4,fit,151944,34b,616682,145lbs,10.0,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,"5' 9""",12,27.0,"September 26, 2016",145.0,69


--- 

# Data Visualization

In [100]:
rent_the_runway_visualization = rent_the_runway_final_data.copy()

---
### EDA for Rent The Runway

In [101]:
# Set a consistent template and color palette
plotly_template = "plotly_white"
color_sequence = px.colors.qualitative.Set2
font_family = "Arial"
fig_width = 1000
fig_height = 500

# Group 1: Reviews & Ratings

In [102]:
# 1. Prepare the Data
# Ensure date column is in datetime format for the time series plot
rent_the_runway_visualization['review_date'] = pd.to_datetime(rent_the_runway_visualization['review_date'])
# --- Visualization 1: Distribution of Ratings (Histogram) ---
fig1 = px.histogram(
    rent_the_runway_visualization, 
    x='rating', 
    nbins=10, 
    title='Distribution of Customer Ratings',
    labels={'rating': 'Rating (0-10)'},
    color_discrete_sequence=color_sequence,
    template=plotly_template
)
fig1.update_layout(
    width=fig_width,
    height=fig_height,
    font=dict(family=font_family, size=16),
    xaxis=dict(showgrid=True, gridcolor='lightgray'),
    yaxis=dict(showgrid=True, gridcolor='lightgray')
)
fig1.show()

# --- Visualization 2: Ratings by Occasion (Box Plot) ---
fig2 = px.box(
    rent_the_runway_visualization, 
    x='rented_for', 
    y='rating', 
    color='rented_for',
    title='Distribution of Ratings by Occasion',
    labels={'rented_for': 'Occasion', 'rating': 'Rating'},
    color_discrete_sequence=color_sequence,
    template=plotly_template
)
fig2.update_layout(
    width=fig_width,
    height=fig_height,
    font=dict(family=font_family, size=16),
    xaxis=dict(showgrid=True, gridcolor='lightgray'),
    yaxis=dict(showgrid=True, gridcolor='lightgray')
)
fig2.show()

# --- Visualization 3: Average Rating over Time (Line Chart) ---
# Aggregate data by month to smooth out the trend
df_monthly_rating = (
    rent_the_runway_visualization
    .set_index('review_date')['rating']
    .resample('M')
    .mean()
    .reset_index()
)
fig3 = px.line(
    df_monthly_rating, 
    x='review_date', 
    y='rating', 
    title='Average Rating Trend Over Time',
    markers=True,
    labels={'review_date': 'Date', 'rating': 'Average Rating'},
    color_discrete_sequence=color_sequence,
    template=plotly_template
)
fig3.update_layout(
    width=fig_width,
    height=fig_height,
    font=dict(family=font_family, size=16),
    xaxis=dict(showgrid=True, gridcolor='lightgray'),
    yaxis=dict(showgrid=True, gridcolor='lightgray')
)
fig3.show()


'M' is deprecated and will be removed in a future version, please use 'ME' instead.

