# Parkinson's Disease Dataset Wrangling

#### This code concerns biological markers of Parkinson's Disease

## Importing Packages

In [1]:
import pandas as pd
from scipy import stats

## Importing Data

In [2]:
# Import the Parkinson's CSV into a pandas DataFrame called 'parkinsons'.
# The path is relative, so the file is looked up in the kernel's current working directory.
parkinsons = pd.read_csv('parkinsons.csv')

In [3]:
# Verify the shape of the dataset, reported as a (rows, columns) tuple
parkinsons.shape

(195, 24)

#### There are 195 rows and 24 columns in this dataset

In [4]:
# Show the first five rows of the dataset (head() returns five rows by default)
parkinsons.head()

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [5]:
# Summarize every column: its name, non-null count, and dtype
parkinsons.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              195 non-null    object 
 1   MDVP:Fo(Hz)       195 non-null    float64
 2   MDVP:Fhi(Hz)      195 non-null    float64
 3   MDVP:Flo(Hz)      195 non-null    float64
 4   MDVP:Jitter(%)    195 non-null    float64
 5   MDVP:Jitter(Abs)  195 non-null    float64
 6   MDVP:RAP          195 non-null    float64
 7   MDVP:PPQ          195 non-null    float64
 8   Jitter:DDP        195 non-null    float64
 9   MDVP:Shimmer      195 non-null    float64
 10  MDVP:Shimmer(dB)  195 non-null    float64
 11  Shimmer:APQ3      195 non-null    float64
 12  Shimmer:APQ5      195 non-null    float64
 13  MDVP:APQ          195 non-null    float64
 14  Shimmer:DDA       195 non-null    float64
 15  NHR               195 non-null    float64
 16  HNR               195 non-null    float64
 1

#### Most of the data is made up of numerical variables. Twenty-two are floats.

## Categorical Data

In [6]:
# Show the first ten values of the 'name' column to see what the identifiers look like
parkinsons['name'].head(10)

0    phon_R01_S01_1
1    phon_R01_S01_2
2    phon_R01_S01_3
3    phon_R01_S01_4
4    phon_R01_S01_5
5    phon_R01_S01_6
6    phon_R01_S02_1
7    phon_R01_S02_2
8    phon_R01_S02_3
9    phon_R01_S02_4
Name: name, dtype: object

In [7]:
# List the distinct values that appear in the 'status' column
parkinsons['status'].unique()

array([1, 0])

#### The values in the `name` column are merely identifiers, so further analysis of them would be unhelpful. The only binary variable in this dataset is `status`; even if this numerical column were converted to a categorical type, there is no other categorical variable to cross-tabulate it with.

In [8]:
# Share of rows in each status group, expressed as a percentage
parkinsons['status'].value_counts(normalize=True).mul(100)

status
1    75.384615
0    24.615385
Name: proportion, dtype: float64

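#### Roughly 75% of the rows in this dataset have status 1, and roughly 25% have status 0. The `name` identifiers suggest several recordings per subject, so these are shares of rows and not necessarily of participants.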

## Comparative Numerical Analysis


In [12]:
# Mean PPE within each status group
parkinsons.groupby('status')['PPE'].mean()

status
0    0.123017
1    0.233828
Name: PPE, dtype: float64

In [13]:
# Median PPE within each status group, for comparison with the group means
parkinsons.groupby('status')['PPE'].median()

status
0    0.115118
1    0.222716
Name: PPE, dtype: float64

For both status levels, the median PPE is lower than the mean PPE. This suggests that the PPE distribution within each status group is positively skewed.

## Numerical Data Analysis

In [14]:
# Mean of spread1 across all rows
parkinsons['spread1'].mean()

np.float64(-5.684396743589745)

In [15]:
# Median of spread1 across all rows
parkinsons['spread1'].median()

np.float64(-5.720868)

In [17]:
# Largest spread1 value
parkinsons['spread1'].max()

np.float64(-2.434031)

In [18]:
# Smallest spread1 value
parkinsons['spread1'].min()

np.float64(-7.964984)

In [19]:
# Range of spread1: largest value minus smallest value
parkinsons['spread1'].max() - parkinsons['spread1'].min()

np.float64(5.530953)

In [20]:
# Sample variance of spread1 (ddof=1 divides by n - 1)
parkinsons['spread1'].var(ddof=1)

np.float64(1.1885529681196454)

In [21]:
# Sample standard deviation of spread1 (ddof=1 divides by n - 1)
parkinsons['spread1'].std(ddof=1)

np.float64(1.090207763740309)

In [22]:
# Interquartile range of spread1 (75th percentile minus 25th percentile)
stats.iqr(parkinsons['spread1'])

np.float64(1.4039040000000007)

## Quiz Questions

#### 1. First, make sure pandas is installed (ideally in a virtual environment), then import it with `import pandas as pd`.

#### 2. You use the `pd.read_csv` function to read CSV files.

#### 3. You display the first five rows by using the df.head() function 

#### 4. df.Age.mean()

#### 5. df.Salary.median()

#### 6. df.Score.std(ddof=1)

#### 7. df.info()

#### 8. df.Age.corr(df['Salary'])

#### 9. subset = df[df['Age'] > 30]

#### 10. df.Score.max() - df.Score.min()

#### 11. df.groupby('Department').Salary.mean()

#### 12. df.groupby('Department').JobTitle.value_counts()

#### 13. df.groupby('Department').Age.max()

#### 14. pd.crosstab(df.Department, df.JobTitle)

#### 15. pd.crosstab(df.Department, df.JobTitle, normalize=True) * 100