#### import libraries

In [1]:
import pandas as pd

## Data cleaning for Rosenberg results

In [91]:
# Read the raw tabulated Rosenberg survey export and preview the first rows.
# Author's notes from inspecting the file: column dtypes are as expected,
# there are no NaNs, and the data was judged ready to use. A txt file in the
# same folder documents every variable and how to read the answer codes.
rosenberg = pd.read_csv(filepath_or_buffer="../data/RSE/data_raw_tabulated.csv")
rosenberg.head(n=5)


Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,gender,age,source,country
0,3,3,1,4,3,4,3,2,3,3,1,40,1,US
1,4,4,1,3,1,3,3,2,3,2,1,36,1,US
2,2,3,2,3,3,3,2,3,3,3,2,22,1,US
3,4,3,2,3,2,3,2,3,3,3,1,31,1,US
4,4,4,1,4,1,4,4,1,1,1,1,30,1,EU


### Adding all scores, and reshaping the table to fit DB model

In [92]:
# Column names of the ten questionnaire items: Q1 ... Q10.
q_lst = [f"Q{item}" for item in range(1, 11)]

# Collapse the ten item answers into one total per respondent.
# Each answered item is scored from 1 (min) to 4 (max), so a fully answered
# questionnaire totals between 10 and 40. Lower totals are still possible:
# a later cell looks for item scores of 0 -- presumably "no answer"; confirm
# against the codebook txt file.
# NOTE(review): the items are summed as-is, with no reverse scoring of any
# item -- verify against the codebook that this is the intended scoring.
rosenberg["overall_score"] = rosenberg[q_lst].sum(axis=1)

# The per-item columns are no longer needed once the total exists.
rosenberg = rosenberg.drop(columns=q_lst)


In [93]:
# Preview the table now that the item columns are replaced by overall_score.
rosenberg.head(n=5)

Unnamed: 0,gender,age,source,country,overall_score
0,1,40,1,US,29
1,1,36,1,US,26
2,2,22,1,US,27
3,1,31,1,US,28
4,1,30,1,EU,25


In [94]:
# The "source" column was judged redundant for the DB table, so remove it.
rosenberg = rosenberg.drop(columns="source")
rosenberg.head(n=5)

Unnamed: 0,gender,age,country,overall_score
0,1,40,US,29
1,1,36,US,26
2,2,22,US,27
3,1,31,US,28
4,1,30,EU,25


In [95]:
# Recode the gender column from numeric codes to letters for readability:
# 1 -> "M", 2 -> "F".
# Any other code is left unchanged (the author's note mentions 0 and 3 for
# "other" / "no answer" -- confirm the exact meaning against the codebook),
# so the column may end up mixing strings and integers.
rosenberg["gender"] = rosenberg["gender"].replace({1: "M", 2: "F"})
rosenberg.head(n=5)

Unnamed: 0,gender,age,country,overall_score
0,M,40,US,29
1,M,36,US,26
2,F,22,US,27
3,M,31,US,28
4,M,30,EU,25


In [102]:
# Arrange the columns in the order used by the DB table; any column that is
# not listed here is dropped from the frame.
rosenberg = rosenberg.loc[:, ["country", "age", "overall_score", "gender"]]

In [104]:
# Age: some rows report an age of 0. They were inspected and deliberately
# kept, because age matters less than the score for this dataset.
# (This lookup is only displayed when it is the last expression in the cell.)
rosenberg.loc[rosenberg["age"] == 0]

# Score: every answered item contributes at least 1, so a total below 10
# means at least one of the ten items was scored 0. Those rows are flagged
# for removal, since the score is the point of the dataset and only a few
# rows are affected. Rows with some 0 items whose total still reaches 10 or
# more are NOT caught by this check.
#
# The row labels are derived from the data rather than pasted in by hand, so
# the list stays correct if the input file or the row order changes, and it
# is simply empty when the flagged rows have already been removed.
empties_lst = rosenberg.index[rosenberg["overall_score"] < 10].tolist()

In [101]:
# Remove the rows whose labels were collected in empties_lst.
# To confirm that no low totals remain afterwards, run:
# rosenberg.loc[rosenberg["overall_score"]<10]
rosenberg = rosenberg.drop(index=empties_lst)

In [103]:
# Summary statistics of the numeric columns after cleaning.
# overall_score is within the expected 10-40 range.
# NOTE(review): age is NOT clean -- the output below shows max ~2.1e9 and
# mean ~44,858 (median 22), plus a min of 0, so a few rows carry implausible
# ages that dominate the mean/std. Decide whether to null or drop them
# before loading into the DB.
rosenberg.describe()

Unnamed: 0,age,overall_score
count,47909.0,47909.0
mean,44857.78,26.724916
std,9811181.0,2.78207
min,0.0,10.0
25%,18.0,25.0
50%,22.0,27.0
75%,32.0,28.0
max,2147484000.0,40.0
