# Investigate the Data

In [10]:
import numpy as np
import pandas as pd
import scipy.stats as stats

In [2]:
# Load rtr_clean.csv (path relative to the notebook's working directory)
# into a DataFrame.
rtr_data = pd.read_csv(filepath_or_buffer='rtr_clean.csv')

In [5]:
# Remove the 'Unnamed: 0' column from the loaded frame.
# The drop is non-mutating and uses errors='ignore', so re-running this cell
# on the same kernel is a no-op instead of raising KeyError once the column
# is already gone.
rtr_data = rtr_data.drop(columns='Unnamed: 0', errors='ignore')
rtr_data.head()

Unnamed: 0,age,body_type,bust_size,category,fit,height,item_id,rating,rented_for,review_date,review_summary,review_text,size,user_id,weight,review_datetime,band_size,cup_size
0,28.0,hourglass,34d,romper,fit,68.0,2260466,10.0,vacation,"April 20, 2016",So many compliments!,An adorable romper! Belt and zipper were a lit...,14,420272,137.0,2016-04-20,34.0,d
1,36.0,straight & narrow,34b,gown,fit,66.0,153475,10.0,other,"June 18, 2013",I felt so glamourous!!!,I rented this dress for a photo shoot. The the...,12,273551,132.0,2013-06-18,34.0,b
2,116.0,,,sheath,fit,64.0,1063761,10.0,party,"December 14, 2015",It was a great time to celebrate the (almost) ...,This hugged in all the right places! It was a ...,4,360448,,2015-12-14,,
3,34.0,pear,34c,dress,fit,65.0,126335,8.0,formal affair,"February 12, 2014",Dress arrived on time and in perfect condition.,I rented this for my company's black tie award...,8,909926,135.0,2014-02-12,34.0,c
4,27.0,athletic,34b,gown,fit,69.0,616682,10.0,wedding,"September 26, 2016",Was in love with this dress !!!,I have always been petite in my upper body and...,12,151944,145.0,2016-09-26,34.0,b


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns of the full data set.
numeric_summary = rtr_data.describe()
numeric_summary

Unnamed: 0,age,height,item_id,rating,size,user_id,weight,band_size
count,191584.0,191867.0,192544.0,192462.0,192544.0,192544.0,162562.0,174133.0
mean,33.871017,65.310621,1045684.0,9.092371,12.245175,499494.100149,137.391709,34.201271
std,8.058083,2.66348,805314.8,1.430044,8.494877,289059.719328,21.899967,1.745246
min,0.0,54.0,123373.0,2.0,0.0,9.0,50.0,28.0
25%,29.0,63.0,195076.0,8.0,8.0,250654.25,123.0,34.0
50%,32.0,65.0,948396.0,10.0,12.0,499419.0,135.0,34.0
75%,37.0,67.0,1678888.0,10.0,16.0,750974.0,148.0,36.0
max,117.0,78.0,2966087.0,10.0,58.0,999997.0,300.0,48.0


In [8]:
# Rows whose weight is strictly greater than the median weight.
# NOTE: despite the "_mean" suffix in the variable name, the cut-off is the
# *median* of rtr_data.weight. Rows with a missing weight, or a weight equal
# to the median, are not selected.
upper_cutoff = rtr_data['weight'].median()
is_above_cutoff = rtr_data['weight'] > upper_cutoff
weight_above_mean = rtr_data.loc[is_above_cutoff]
weight_above_mean.describe()

Unnamed: 0,age,height,item_id,rating,size,user_id,weight,band_size
count,69984.0,69963.0,70154.0,70134.0,70154.0,70154.0,70154.0,65181.0
mean,34.538523,66.338608,1037025.0,9.056749,17.507997,498300.936112,156.242039,35.097467
std,8.092659,2.607464,795300.6,1.441387,7.64552,287163.815629,19.382975,1.687062
min,0.0,54.0,123373.0,2.0,0.0,9.0,136.0,28.0
25%,29.0,64.0,199231.0,8.0,12.0,254236.0,143.0,34.0
50%,33.0,66.0,947584.0,10.0,16.0,495914.5,150.0,36.0
75%,38.0,68.0,1661761.0,10.0,20.0,746802.5,165.0,36.0
max,117.0,76.0,2966087.0,10.0,58.0,999997.0,300.0,48.0


In [9]:
# Rows whose weight is strictly less than the median weight.
# NOTE: despite the "_mean" suffix in the variable name, the cut-off is the
# *median* of rtr_data.weight. Rows with a missing weight, or a weight equal
# to the median, are not selected.
lower_cutoff = rtr_data['weight'].median()
is_below_cutoff = rtr_data['weight'] < lower_cutoff
weight_below_mean = rtr_data.loc[is_below_cutoff]
weight_below_mean.describe()

Unnamed: 0,age,height,item_id,rating,size,user_id,weight,band_size
count,78546.0,78503.0,78703.0,78673.0,78703.0,78703.0,78703.0,73750.0
mean,33.583327,64.246487,1073752.0,9.122774,6.306011,497584.73752,121.005451,33.254373
std,8.148653,2.324129,813025.4,1.420702,4.081294,290958.564025,8.336919,1.252001
min,0.0,54.0,123373.0,2.0,0.0,25.0,50.0,28.0
25%,28.0,63.0,227442.0,8.0,4.0,243534.5,115.0,32.0
50%,32.0,64.0,985499.0,10.0,5.0,499398.0,122.0,34.0
75%,37.0,66.0,1697200.0,10.0,8.0,751499.5,128.0,34.0
max,117.0,76.0,2966087.0,10.0,51.0,999994.0,134.0,42.0


## Two-sample $t$-tests

### Samples based on weight strictly above or below the median weight

In [12]:
# Welch's two-sample t-test (equal_var=False) on the rating column.
# H_0: the mean rating of users whose weight is below the median weight is
#      equal to the mean rating of users whose weight is above the median weight.
# The two samples are split on rtr_data.weight.median(), not on the mean weight,
# even though the variable names end in "_mean".
# Missing ratings are dropped (nan_policy='omit'). The below-median sample is
# passed first, so a positive statistic means that group has the higher mean rating.
stats.ttest_ind(
    weight_below_mean.rating,
    weight_above_mean.rating,
    equal_var=False,
    nan_policy='omit',
)

Ttest_indResult(statistic=8.880402099841763, pvalue=6.734837106616535e-19)

### samples based on top 500 items vs 500 items above $75^\textrm{th}$ percentile
(1499 items are above the $75^\textrm{th}$ percentile)

In [14]:
# For every (item_id, user_id) pair, count the non-null entries of each
# remaining column, then show the pairs that have more than one non-null
# rating, i.e. pairs that occur on more than one row.
pair_keys = ['item_id', 'user_id']
df = rtr_data.groupby(pair_keys).count()
has_repeat_rating = df['rating'] > 1
df[has_repeat_rating]

Unnamed: 0_level_0,Unnamed: 1_level_0,age,body_type,bust_size,category,fit,height,rating,rented_for,review_date,review_summary,review_text,size,weight,review_datetime,band_size,cup_size
item_id,user_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
123793,300403,2,2,2,2,2,2,2,2,2,2,2,2,0,2,2,2
126335,148459,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
126335,750308,2,2,0,2,2,2,2,2,2,2,2,2,2,2,0,0
131533,805713,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
132135,720559,0,0,2,2,2,2,2,2,2,2,2,2,0,2,2,2
136110,306117,2,2,0,2,2,2,2,2,2,2,2,2,0,2,0,0
141761,19507,2,2,2,2,2,2,2,2,2,2,2,2,0,2,2,2
144051,496417,2,2,2,2,2,2,2,2,2,2,2,2,0,2,2,2
144727,119853,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
149655,456079,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2


In [18]:
# Number of rows that exactly repeat an earlier row across all columns.
repeat_row_mask = rtr_data.duplicated()
repeat_row_mask.sum()

189

In [20]:
# Number of rows that repeat an earlier row's combination of
# item_id, user_id and review_summary.
review_key = ['item_id', 'user_id', 'review_summary']
rtr_data.duplicated(subset=review_key).sum()

221