# Introduction to Python - Exercises

In [1]:
import pandas as pd
from sklearn.datasets import fetch_california_housing

In [2]:
cali_houses = fetch_california_housing()
cali_houses = pd.DataFrame(columns=cali_houses.feature_names, data=cali_houses.data)

In [3]:
# We have created a dataset and stored it in the variable cali_houses.
# Try to get an overview of the first 5 rows of the dataset
cali_houses.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [4]:
# Calculate the average of all the columns in the dataset
cali_houses.mean()


MedInc           3.870671
HouseAge        28.639486
AveRooms         5.429000
AveBedrms        1.096675
Population    1425.476744
AveOccup         3.070655
Latitude        35.631861
Longitude     -119.569704
dtype: float64

In [5]:
# Calculate the average of the HouseAge column.
cali_houses["HouseAge"].mean()

28.639486434108527

In [6]:
# Calculate the standard deviation of the Population column.
cali_houses["Population"].std()

1132.4621217653375

In [7]:
# Describe the main statistics of the dataset
cali_houses.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


# Build confidence intervals

## Exo 1 - Facebook
* Facebook would like to know the average time spent on their site. 
* To do so, they launched a study on 100 people and asked them how much time they spend on Facebook per day:

In [8]:
import pandas as pd

In [9]:
# Open the Hours spent on Facebook.xlsx file
df = pd.read_excel("assets\Python\Hours Spent on Facebook.xlsx")
df.head()

Unnamed: 0,Customer,Hours spent on Facebook on Mobile,Unnamed: 2,Unnamed: 3
0,1,4.092475,,2.978913
1,2,7.233371,,
2,3,7.455792,,
3,4,7.253503,,
4,5,2.249682,,


In [10]:
# Calculate the mean and standard deviation of the corresponding column.
MyMean = df["Hours spent on Facebook on Mobile"].mean()
MyStd = df["Hours spent on Facebook on Mobile"].std()
# MyStd = df.iloc[:,1].std()
print(f"Moyenne = {MyMean:.2f} et Ecart type = {MyStd:.2f}")


Moyenne = 4.85 et Ecart type = 2.98


$$E = t \frac{s}{\sqrt{n}}$$

In [11]:
# We would like to know the mean of the total population. Calculate the 95% confidence interval.
# 95% => t = 1.96
t = 1.96
n =len(df)
E = t * MyStd/n**.5
print(f"E                       : {E:.2f}")
print(f"Intervalle de confiance : [{MyMean-E:.2f}, {MyMean+E:.2f}]")

E                       : 0.58
Intervalle de confiance : [4.27, 5.43]


## Exo 2 - Google
* Google would like to know the mean monthly revenue generated per client on their Adwords platform. 
* Due to lack of time, they have launched a questionnaire on only 31 people.

In [28]:
# Open the monthly_spending_adwords.xlsx file.
df = pd.read_excel("assets\Python\Monthly spending on adwords.xlsx")
df.head()

Unnamed: 0,Monthly spending
0,562.460391
1,382.54366
2,947.684175
3,425.084351
4,546.562302


In [29]:
# Calculate the mean and standard deviation of the corresponding column.
MyMean = df["Monthly spending"].mean()
MyStd = df["Monthly spending"].std()

print(f"Moyenne = {MyMean:.2f} et Ecart type = {MyStd:.2f}")


Moyenne = 491.77 et Ecart type = 253.96


$$E = t \frac{s}{\sqrt{n}}$$

In [31]:
# We would like to know the mean of the total population. Calculate the 95% confidence interval.
# 95% => t = 1.96 MAIS on a 31 personnes => Student avec df=30 => 2.043 sur https://www.statdistributions.com/t/
t = 2.043 
n =len(df)
E = t * MyStd/n**.5
print(f"E                       : {E:.2f}")
print(f"Intervalle de confiance : [{MyMean-E:.2f}, {MyMean+E:.2f}]")

E                       : 93.19
Intervalle de confiance : [398.59, 584.96]


In [19]:
from scipy import stats
Std_Err_Of_Mean = stats.sem(df["Monthly spending"])
alpha = 0.95
Conf_Interval = stats.t.interval(alpha, len(df["Monthly spending"])-1, loc=MyMean, scale=Std_Err_Of_Mean)
print(f"Intervalle de confiance : [{Conf_Interval[0]:.2f}, {Conf_Interval[1]:.2f}]")

Intervalle de confiance : [398.62, 584.93]


## Exo 3 - Nintendo
* Nintendo is building their new console. 
* However, the last console they launched was a disaster because no one bought it. 
* This time, Nintendo would like to know how many cities they would have to survey to be 95% sure that they will build the right number of systems. 
* The company will accept an error of 5000 consoles.

In [20]:
# Open the Number of switch sold.xlsx file.
df = pd.read_excel("./assets/Python/Number of switch sold.xlsx")
df.head()

Unnamed: 0,City,Number of sales
0,New York,65538.794054
1,San Francisco,45709.28987
2,Seattle,87417.957092
3,Denver,96461.186529
4,Austin,3665.528408


In [21]:
# Calculate the mean and standard deviation of the corresponding column.
MyMean = df["Number of sales"].mean()
MyStd = df["Number of sales"].std()

print(f"Moyenne = {MyMean:.2f} et Ecart type = {MyStd:.2f}")

# Set the margin of error to 5000.
E = 5000

Moyenne = 42996.94 et Ecart type = 28365.34


$$E = t \frac{s}{\sqrt{n}}$$

Et donc 

$$ \sqrt{n} = t \frac{s}{E}$$

$$ n = (t \frac{s}{E})^2$$


In [22]:
# Find the minimum number of cities to study.
# 95% => 1.96
t = 1.96 
n = (t*MyStd/E)**2
print(f"Nombre minimum de villes : {int(n+1)}")

Nombre minimum de villes : 124


## Exo 4

* Apple has just launched their new IOS for Iphone and would like to know if customers are satisfied with this version or not. 
* They asked 40 people about their satisfaction with the OS and collected their answers. 
* For educational purposes, we have marked the satisfied with a 1 and the dissatisfied with a 0.

In [23]:
# Open the satisfaction apple release.xlsx file.
df = pd.read_excel("./assets/Python/satisfaction apple release.xlsx")
df.head()




Unnamed: 0,Satisfied customer
0,0
1,0
2,0
3,0
4,0


In [24]:
# Set the number n of people in the sample.
n = len(df)

# Calculate k, the number of people satisfied with the OS.
k = df["Satisfied customer"].sum()

# Calculate p, the proportion of people satisfied with the OS.
p = k/n



$$E = z \sqrt{\frac{p (1-p)}{n}}$$


In [26]:
# Calculate the confidence interval of the proportion of the population.
E = 1.96 * (p*(1-p)/n)**.5

print(f"p                       : {p:.2f}")
print(f"E                       : {E:.2f}")
print(f"Intervalle de confiance : [{p-E:.2f}, {p+E:.2f}]")

p                       : 0.37
E                       : 0.15
Intervalle de confiance : [0.22, 0.51]


## Exo 5

* A laboratory has created a new drug and would like to test its effectiveness. 
* Based on tests done on rats, experts estimate that it should have an efficiency rate of about 90%. 
* They would like to know the number of individuals (n) that would have to be tested to confirm its 95% effectiveness with a Error margin E of 1%.

$$E = z \sqrt{\frac{p (1-p)}{n}}$$

donc

$$n = p (1-p) (\frac{z}{E})^2$$

In [22]:
# Set p, the estimated proportion.
p=.9
# Set z, the z−value for a 95% confidence level.
z = 1.96
# Set your margin of Error, E.
E = 0.01
# Calculate n.
n = p*(1-p)*(z/E)**2

print(f"Nombre de personnes à tester: {n:.2f} => {int(n+1)}")

Nombre de personnes à tester: 3457.44 => 3458
