# Python Basics for Analytics

## sorted() & zip()

In [1]:
# Sample data: a list of signed integers and a parallel list of labels.
a = [-1, -5, 3, -33]
b = ['x', 'y', 'z', 't']

print("-----sorted()------")
# Plain ascending order; `a` itself is not modified.
ascending = sorted(a)
print(ascending)
# Order by absolute value, largest magnitude first (sort a copy in place).
by_magnitude = list(a)
by_magnitude.sort(key=abs, reverse=True)
print(by_magnitude)

print("-----zip()------")
# Pair the i-th number with the i-th label.
z = [pair for pair in zip(a, b)]
print(z)

# zip(*pairs) undoes the pairing: one tuple of numbers, one tuple of labels.
unzipped = zip(*z)
c, d = unzipped
print(c, d)

-----sorted()------
[-33, -5, -1, 3]
[-33, -5, 3, -1]
-----zip()------
[(-1, 'x'), (-5, 'y'), (3, 'z'), (-33, 't')]
(-1, -5, 3, -33) ('x', 'y', 'z', 't')


## Numpy

In [2]:
import numpy as np

print("-----rand()------")
# With no arguments, rand() generates a single random float in [0, 1).
single_draw = np.random.rand()
print(single_draw)
print("----")
# The arguments are the output dimensions: a (3, 2) matrix of floats in [0, 1).
uniform_matrix = np.random.rand(3, 2)
print(uniform_matrix)

print("-----randint()------")
# Two random integers drawn with low=5 and high=15.
print(np.random.randint(low=5, high=15, size=2))
print("----")
# Same bounds, but a tuple for `size` produces a (3, 2) matrix.
print(np.random.randint(low=5, high=15, size=(3, 2)))

print("-----choice()------")
array = np.array([1, 2, 3, 4, 5])
# Sampling with replacement: the same value may be picked more than once.
with_replacement = np.random.choice(array, size=10, replace=True)
print(with_replacement)
print("----")
# Sampling without replacement: every picked value is distinct.
without_replacement = np.random.choice(array, size=3, replace=False)
print(without_replacement)

print("-----shuffle()------")
# shuffle() reorders `array` in place, so the array itself is printed afterwards.
np.random.shuffle(array)
print(array)

-----rand()------
0.08479533816050111
----
[[0.13409434 0.37143917]
 [0.89567935 0.488758  ]
 [0.69219938 0.16613224]]
-----randint()------
[6 7]
----
[[ 8 10]
 [ 6 11]
 [10  9]]
-----choice()------
[1 2 5 2 4 5 1 4 2 3]
----
[3 4 1]
-----shuffle()------
[1 3 2 4 5]


## Scipy

In [3]:
from scipy import stats
import numpy as np

# array_2 is a second name for the same array object (no copy is made), so the
# two inputs to pearsonr() below are identical.
array_1 = np.array([1, 2, 3, 4, 5, 6])
array_2 = array_1

print("----- stats.pearsonr() => Correlation ------")
# The result holds the correlation coefficient followed by the p-value.
pearson_result = stats.pearsonr(array_1, array_2)
print(pearson_result, '\n')

print("----- Generating samples (des échantillons) ------")
# Draw 10 random values from a normal distribution with mean (loc) 0 and
# standard deviation (scale) 10.
x = stats.norm.rvs(loc=0, scale=10, size=10)
print(x, '\n')

print("----- Probability density function ------")
# pdf() evaluates the density of the distribution at a point. It is a density,
# not the probability of drawing exactly that value.
p1 = stats.norm.pdf(-100, loc=0, scale=10)  # density far out in the tail
p2 = stats.norm.pdf(0, loc=0, scale=10)     # density at the mean
print(p1, p2, '\n')

print("----- Cumulative distribution function ------")
# cdf() is the probability of drawing a value less than or equal to the point.
p3 = stats.norm.cdf(0, loc=0, scale=10)
print(p3, '\n')

print("----- Calculating descriptive statistics ------")
# Summarise 500 draws from a normal distribution with mean 0 and standard deviation 1.
standard_normal_sample = stats.norm.rvs(loc=0, scale=1, size=500)
print(stats.describe(standard_normal_sample))

----- stats.pearsonr() => Correlation ------
(0.9999999999999999, 1.8488927466117464e-32) 

----- Generating samples (des échantillons) ------
[ -6.5672749    5.07770582   7.13420108  -2.79244994  12.9020118
   1.65046179 -15.53955311   2.24221342  -2.53426133  12.16519682] 

----- Probability density function ------
7.69459862670642e-24 0.03989422804014327 

----- Cumulative distribution function ------
0.5 

----- Calculating descriptive statistics ------
DescribeResult(nobs=500, minmax=(-2.8471299794510663, 3.046757161348687), mean=0.02742068076091738, variance=0.9106867296858975, skewness=0.0709527736970233, kurtosis=-0.2018059641073635)


## Pandas

In [4]:
import pandas as pd
# Column labels supplied by hand: the file is read with header=None, i.e. it is
# treated as having no header row of its own.
names = ['age', 'workclass', 'fnlwgt', 'education', 'educationnum', 'maritalstatus', 'occupation', 'relationship', 'race',
        'sex', 'capitalgain', 'capitalloss', 'hoursperweek', 'nativecountry', 'label']
train_df = pd.read_csv("Data/adult.data", header=None, names=names)

print("----- statistics on data ------")
# Describe only the column of interest instead of summarising every numeric
# column and then discarding all but one.
print(train_df['age'].describe(), '\n')

print("----- Finding the data types ------")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
train_df.info()
print()

----- statistics on data ------
count    32561.000000
mean        38.581647
std         13.640433
min         17.000000
25%         28.000000
50%         37.000000
75%         48.000000
max         90.000000
Name: age, dtype: float64 

----- Finding the data types ------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            32561 non-null  int64 
 1   workclass      32561 non-null  object
 2   fnlwgt         32561 non-null  int64 
 3   education      32561 non-null  object
 4   educationnum   32561 non-null  int64 
 5   maritalstatus  32561 non-null  object
 6   occupation     32561 non-null  object
 7   relationship   32561 non-null  object
 8   race           32561 non-null  object
 9   sex            32561 non-null  object
 10  capitalgain    32561 non-null  int64 
 11  capitalloss    32561 non-null  int64 
 12  hoursperweek

In [5]:
print("----- Converting data types ------")
# Series.astype() returns a converted copy. The result is kept in its own
# variable so that train_df['age'] stays numeric for the statistics below.
# Related converters: pd.to_numeric(), pd.to_datetime().
age_as_str = train_df['age'].astype(str)
print(type(age_as_str.iloc[0]), '\n')

print("----- Unique values ------")
# Distinct categories, then how many rows fall into each of them.
relationship = train_df['relationship']
print(relationship.unique())
print(relationship.value_counts(), '\n')

print("----- Grouping the data ------")
# Share of each label within every relationship group (normalize=True -> fractions).
print(train_df.groupby('relationship')['label'].value_counts(normalize=True), '\n')
# Average weekly hours per workclass.
print(train_df.groupby(['workclass'])['hoursperweek'].mean(), '\n')

print("----- Finding the correlation ------")
# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps this working on pandas versions where DataFrame.corr() no longer drops
# non-numeric columns on its own.
print(train_df.select_dtypes(include='number').corr())

----- Converting data types ------
<class 'str'> 

----- Unique values ------
[' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried'
 ' Other-relative']
 Husband           13193
 Not-in-family      8305
 Own-child          5068
 Unmarried          3446
 Wife               1568
 Other-relative      981
Name: relationship, dtype: int64 

----- Grouping the data ------
relationship     label 
 Husband          <=50K    0.551429
                  >50K     0.448571
 Not-in-family    <=50K    0.896930
                  >50K     0.103070
 Other-relative   <=50K    0.962283
                  >50K     0.037717
 Own-child        <=50K    0.986780
                  >50K     0.013220
 Unmarried        <=50K    0.936738
                  >50K     0.063262
 Wife             <=50K    0.524872
                  >50K     0.475128
Name: label, dtype: float64 

workclass
 ?                   31.919390
 Federal-gov         41.379167
 Local-gov           40.982800
 Never-worked        28.428571
 Pr

In [6]:
print("----- Pivot Table ------")
# Mean weekly hours for every (relationship, workclass) row, with one column per
# label. Combinations that have no rows come out as NaN.
hours_pivot = pd.pivot_table(
    train_df,
    values='hoursperweek',
    index=['relationship', 'workclass'],
    columns=['label'],
    # The string alias is used instead of np.mean: recent pandas versions emit a
    # FutureWarning when a NumPy callable is passed as aggfunc.
    aggfunc='mean',
)
print(hours_pivot.round(2))

----- Pivot Table ------
label                               <=50K   >50K
relationship    workclass                       
 Husband         ?                  30.72  37.33
                 Federal-gov        42.34  43.05
                 Local-gov          41.40  44.56
                 Private            42.50  46.18
                 Self-emp-inc       48.29  50.49
                 Self-emp-not-inc   46.01  48.07
                 State-gov          38.67  45.17
                 Without-pay        34.25    NaN
 Not-in-family   ?                  31.29  39.44
                 Federal-gov        40.60  47.54
                 Local-gov          40.38  45.01
                 Never-worked       35.00    NaN
                 Private            40.20  47.03
                 Self-emp-inc       49.06  53.58
                 Self-emp-not-inc   41.53  45.02
                 State-gov          38.87  44.19
 Other-relative  ?                  29.10  40.00
                 Federal-gov        38.40  4

In [7]:
print("----- Cross Tab ------")
# Count the rows in every (relationship, label) combination.
# normalize=False keeps raw counts rather than proportions.
relationship_by_label = pd.crosstab(
    index=train_df['relationship'],
    columns=train_df['label'],
    normalize=False,
)
print(relationship_by_label)

----- Cross Tab ------
label             <=50K   >50K
relationship                  
 Husband           7275   5918
 Not-in-family     7449    856
 Other-relative     944     37
 Own-child         5001     67
 Unmarried         3228    218
 Wife               823    745


In [8]:
import pandas.util.testing as tm
import numpy as np
import pandas as pd

print("----- Create long dataframe ------")
def unpivot(frame):
    """Reshape a wide frame into long form with one row per (index, column) cell.

    Parameters
    ----------
    frame : pandas.DataFrame
        Wide table; its index values end up in the 'date' column.

    Returns
    -------
    pandas.DataFrame
        Columns 'date', 'variable' and 'value', listed column by column of the
        input (all rows of the first column, then all rows of the second, ...).
    """
    n_rows, n_cols = frame.shape
    index_values = np.asarray(frame.index)
    column_labels = np.asarray(frame.columns)
    long_columns = {
        # the whole index, once per input column
        'date': np.tile(index_values, n_cols),
        # each column label, once per input row
        'variable': column_labels.repeat(n_rows),
        # column-major ('F') flattening so values line up with the labels above
        'value': frame.values.ravel('F'),
    }
    return pd.DataFrame(long_columns)

# Build the small wide demo frame explicitly: 3 consecutive business days by
# columns A-D, filled with standard-normal noise. This is what
# tm.makeTimeDataFrame(3) produced, but that helper lives in the deprecated
# pandas.util.testing module and is not available in newer pandas releases.
wide_frame = pd.DataFrame(
    np.random.randn(3, 4),
    index=pd.bdate_range('2000-01-03', periods=3),
    columns=list('ABCD'),
)
df = unpivot(wide_frame)
print(df, '\n')

print("----- Pivot ------")
# Long -> wide: one row per date, one column per variable.
df_pivot = df.pivot(index='date', columns='variable', values='value')
print(df_pivot, '\n')

print("----- Unpivot ------")
# Wide -> long again: unstack() moves the columns into the index, giving a
# Series indexed by (variable, date).
print(df_pivot.unstack())

----- Create long dataframe ------
         date variable     value
0  2000-01-03        A  0.129016
1  2000-01-04        A  0.207200
2  2000-01-05        A  1.245801
3  2000-01-03        B  0.282468
4  2000-01-04        B  0.347059
5  2000-01-05        B -0.002431
6  2000-01-03        C  0.769712
7  2000-01-04        C  0.497418
8  2000-01-05        C -1.944693
9  2000-01-03        D -0.417650
10 2000-01-04        D  0.083522
11 2000-01-05        D -1.215114 

----- Pivot ------
variable           A         B         C         D
date                                              
2000-01-03  0.129016  0.282468  0.769712 -0.417650
2000-01-04  0.207200  0.347059  0.497418  0.083522
2000-01-05  1.245801 -0.002431 -1.944693 -1.215114 

----- Unpivot ------
variable  date      
A         2000-01-03    0.129016
          2000-01-04    0.207200
          2000-01-05    1.245801
B         2000-01-03    0.282468
          2000-01-04    0.347059
          2000-01-05   -0.002431
C         2000-01-

  """Entry point for launching an IPython kernel.
