# Python Basics for Analytics

## sorted() & zip()

In [46]:
# Two parallel lists: numbers and the labels paired with them by position.
a = [-1, -5, 3, -33]
b = ['x', 'y', 'z', 't']

print("-----sorted()------")
# sorted() builds a new list and leaves `a` untouched.
ascending = sorted(a)
# key=abs orders by absolute value; reverse=True puts the largest magnitude first.
by_magnitude_desc = sorted(a, key=abs, reverse=True)
print(ascending)
print(by_magnitude_desc)

print("-----zip()------")
# zip() pairs elements position by position; unpack the iterator into a list to print it.
z = [*zip(a, b)]
print(z)

# zip(*z) is the inverse operation: it regroups the pairs into one tuple per position.
unzipped = zip(*z)
c, d = unzipped
print(c, d)

-----sorted()------
[-33, -5, -1, 3]
[-33, -5, 3, -1]
-----zip()------
[(-1, 'x'), (-5, 'y'), (3, 'z'), (-33, 't')]
(-1, -5, 3, -33) ('x', 'y', 'z', 't')


## Numpy

In [55]:
import numpy as np

# Seed NumPy's global generator so a fresh "Restart & Run All" prints the same
# values every time; without a seed each run of this cell gives different numbers.
SEED = 42
np.random.seed(SEED)

print("-----rand()------")
print(np.random.rand())      # Generate a single random float in the range [0, 1)
print("----")
print(np.random.rand(3, 2))  # Generate a (3, 2) array of random floats in the range [0, 1)

print("-----randint()------")
print(np.random.randint(5, 15, 2))       # 2 random integers, low=5 inclusive, high=15 exclusive
print("----")
print(np.random.randint(5, 15, (3, 2)))  # same range, returned as a (3, 2) array

print("-----choice()------")
array = np.array([1, 2, 3, 4, 5])
print(np.random.choice(array, 10, replace=True))  # replace=True: the same value can be sampled more than once
print("----")
print(np.random.choice(array, 3, replace=False))  # replace=False: the same value can't be sampled more than once

print("-----shuffle()------")
# shuffle() reorders `array` in place and returns None, so print the array afterwards.
np.random.shuffle(array)
print(array)

-----rand()------
0.7316961330873843
----
[[0.54876532 0.66840215]
 [0.98741047 0.33870392]
 [0.5936519  0.20722547]]
-----randint()------
[13 12]
----
[[ 6 12]
 [13  5]
 [11  8]]
-----choice()------
[4 3 2 1 5 4 2 4 4 2]
----
[1 3 4]
-----shuffle()------
[3 5 2 4 1]


## Scipy

In [8]:
from scipy import stats
import numpy as np

array_1 = np.array([1, 2, 3, 4, 5, 6])
array_2 = array_1  # same object as array_1, so the correlation below is 1 by construction

# One seeded generator shared by both rvs() calls below, so the samples are
# reproducible on a fresh kernel while the two draws still differ from each other.
SEED = 42
rng = np.random.RandomState(SEED)

print("----- stats.pearsonr() => Correlation ------")
print(stats.pearsonr(array_1, array_2), '\n')  # correlation coefficient and p-value

print("----- Generating samples (des échantillons) ------")
# Draw 10 values from a normal distribution with mean = 0 and standard deviation = 10
x = stats.norm.rvs(loc=0, scale=10, size=10, random_state=rng)
print(x, '\n')

print("----- Probability density function ------")
# pdf() returns the probability *density* at a point, not a probability: for a
# continuous distribution the probability of drawing one exact value is 0.
# Densities are useful for comparing how likely the neighbourhoods of two points are.
p1 = stats.norm.pdf(x=-100, loc=0, scale=10)  # density at -100 (10 standard deviations below the mean)
p2 = stats.norm.pdf(x=0, loc=0, scale=10)     # density at the mean, where it peaks
print(p1, p2, '\n')

print("----- Cumulative distribution function ------")
p3 = stats.norm.cdf(x=0, loc=0, scale=10)     # P(X <= 0): probability of sampling a value less than or equal to 0
print(p3, '\n')

print("----- Calculating descriptive statistics ------")
# Descriptive statistics for 500 points drawn from a normal distribution with mean 0 and standard deviation 1
print(stats.describe(stats.norm.rvs(loc=0, scale=1, size=500, random_state=rng)))

----- stats.pearsonr() => Correlation ------
(0.9999999999999999, 1.8488927466117464e-32) 

----- Generating samples (des échantillons) ------
[  0.93600722   9.64268932 -17.76904533  -1.31056124   1.57372967
  -4.9847607   -3.7819764   -8.05658937   2.77448078   2.28085689] 

----- Probability density function ------
7.69459862670642e-24 0.03989422804014327 

----- Cumulative distribution function ------
0.5 

----- Calculating descriptive statistics ------
DescribeResult(nobs=500, minmax=(-3.4581696251964007, 2.8466736796806162), mean=-0.0046552576858826565, variance=0.9765453476087166, skewness=-0.08368241334586721, kurtosis=0.06634940504223596)


## Pandas

In [6]:
import pandas as pd
# Column names supplied by hand and passed to read_csv together with header=None,
# i.e. the file is read as having no header row of its own.
names = [
    'age', 'workclass', 'fnlwgt', 'education', 'educationnum', 'maritalstatus',
    'occupation', 'relationship', 'race', 'sex', 'capitalgain', 'capitalloss',
    'hoursperweek', 'nativecountry', 'label',
]
# NOTE(review): string fields appear to keep the space that follows each comma
# (values print as ' Husband', ' Wife', ...); consider skipinitialspace=True if
# that is not intended - not changed here because it would alter the loaded values.
train_df = pd.read_csv("Data/adult.data", header=None, names=names)

print("----- statistics on data ------")
# Summarise only the column being shown instead of describing every numeric column first.
print(train_df['age'].describe(), '\n')

print("----- Finding the data types ------")
# info() writes its report to stdout and returns None, so wrapping it in print()
# would append a stray "None" line; call it directly and print the blank line separately.
train_df.info()
print()

----- statistics on data ------
count    32561.000000
mean        38.581647
std         13.640433
min         17.000000
25%         28.000000
50%         37.000000
75%         48.000000
max         90.000000
Name: age, dtype: float64 

----- Finding the data types ------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            32561 non-null  int64 
 1   workclass      32561 non-null  object
 2   fnlwgt         32561 non-null  int64 
 3   education      32561 non-null  object
 4   educationnum   32561 non-null  int64 
 5   maritalstatus  32561 non-null  object
 6   occupation     32561 non-null  object
 7   relationship   32561 non-null  object
 8   race           32561 non-null  object
 9   sex            32561 non-null  object
 10  capitalgain    32561 non-null  int64 
 11  capitalloss    32561 non-null  int64 
 12  hoursperweek

In [34]:
print("----- Converting data types ------")
# astype() casts a column to an explicit dtype; pd.to_numeric() and pd.to_datetime()
# are the parsing helpers for numbers and dates.
# NOTE: this overwrites train_df['age'] with strings, so 'age' is no longer a
# numeric column for anything that runs after this line.
train_df['age'] = train_df['age'].astype(str)
print(type(train_df['age'][0]), '\n')

print("----- Unique values ------")
print(train_df['relationship'].unique())                # distinct values, in order of first appearance
print(train_df['relationship'].value_counts(), '\n')    # how many rows carry each value

print("----- Grouping the data ------")
# Share of each label within every relationship group (normalize=True gives proportions).
print(train_df.groupby('relationship')['label'].value_counts(normalize=True), '\n')
# Mean hours per week for each workclass.
print(train_df.groupby(['workclass'])['hoursperweek'].mean(), '\n')

print("----- Finding the correlation ------")
# Correlation is only defined for numeric columns. Select them explicitly:
# pandas >= 2.0 no longer drops string columns silently in corr() and raises
# on them instead, while this form behaves the same on older versions.
print(train_df.select_dtypes(include='number').corr())

----- Converting data types ------
<class 'str'> 

----- Unique values ------
[' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried'
 ' Other-relative']
 Husband           13193
 Not-in-family      8305
 Own-child          5068
 Unmarried          3446
 Wife               1568
 Other-relative      981
Name: relationship, dtype: int64 

----- Grouping the data ------
relationship     label 
 Husband          <=50K    0.551429
                  >50K     0.448571
 Not-in-family    <=50K    0.896930
                  >50K     0.103070
 Other-relative   <=50K    0.962283
                  >50K     0.037717
 Own-child        <=50K    0.986780
                  >50K     0.013220
 Unmarried        <=50K    0.936738
                  >50K     0.063262
 Wife             <=50K    0.524872
                  >50K     0.475128
Name: label, dtype: float64 

workclass
 ?                   31.919390
 Federal-gov         41.379167
 Local-gov           40.982800
 Never-worked        28.428571
 Pr