In [304]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.arima.model import ARIMA

In [280]:
# Load the four CSV datasets used in this notebook from a shared base folder.
DATA_DIR = '/content/drive/MyDrive/Datasets/pandas_data_analysis'

cov_ds = pd.read_csv(DATA_DIR + '/covid.csv')
fif_ds = pd.read_csv(DATA_DIR + '/players_19.csv')
hap_ds = pd.read_csv(DATA_DIR + '/happy/2019.csv')
sup_ds = pd.read_csv(DATA_DIR + '/train.csv')

In [16]:
# 1
# Order the rows by 'Score' from highest to lowest and show the first five.
ranked_by_score = hap_ds.sort_values(by='Score', ascending=False)
ranked_by_score.head(5)

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298


In [35]:
# 2
# Correlation matrix of the 'New_cases' and 'New_deaths' columns.
case_death_cols = ['New_cases', 'New_deaths']
cov_ds[case_death_cols].corr()

Unnamed: 0,New_cases,New_deaths
New_cases,1.0,0.288404
New_deaths,0.288404,1.0


In [97]:
# 3
# Parse the non-missing order dates (day/month/year) and store the order year
# as a new 'Year' column at position 3.
order_dates = pd.to_datetime(
  sup_ds['Order Date'].dropna(axis=0, how='any'),
  format='%d/%m/%Y',
)
sup_ds.insert(3, 'Year', order_dates.dt.year)

# Total 'Sales' per year, then the year with the largest percentage change
# relative to the previous year.
yearly_revenue = sup_ds.groupby('Year')['Sales'].sum()
yearly_growth = yearly_revenue.pct_change()
yearly_growth.idxmax()

2017

In [102]:
# 4
# Correlation matrix of the 'weight_kg' and 'overall' columns.
weight_overall_cols = ['weight_kg', 'overall']
fif_ds[weight_overall_cols].corr()

Unnamed: 0,weight_kg,overall
weight_kg,1.0,0.15461
overall,0.15461,1.0


In [153]:
# 5
# Cluster players into three groups using 'skill_moves' and 'dribbling'.
# Missing values are replaced by the column mean. The filled column is assigned
# back explicitly: fif_ds[col].fillna(..., inplace=True) is chained assignment
# on an intermediate object, which pandas deprecates and which no longer
# updates fif_ds once Copy-on-Write is enabled.
for col in ['skill_moves', 'dribbling']:
  fif_ds[col] = fif_ds[col].fillna(fif_ds[col].mean())

players = fif_ds[['skill_moves', 'dribbling']]
# A fixed seed keeps the cluster assignment reproducible between runs.
kmeans = KMeans(n_clusters=3, random_state=42)

# Store each player's cluster label and show how many players fall in each one.
fif_ds.insert(1, 'Cluster', kmeans.fit_predict(players))
fif_ds['Cluster'].value_counts()



Cluster
0    9381
2    5782
1    2922
Name: count, dtype: int64

In [None]:
# 6
# Average of the monthly totals of new cases and new deaths.
date = pd.to_datetime(cov_ds['Date_reported'], format='%Y-%m-%d')
cov_ds.insert(0, 'Month', date.dt.to_period('M'))

# Sum only the two columns of interest per month (summing the whole frame would
# also try to add up the text columns), then average those monthly totals.
# The aggregation name must be exactly 'mean'; ' mean' with a leading space is
# not a recognised aggregation.
monthly_totals = cov_ds.groupby('Month')[['New_cases', 'New_deaths']].sum()
monthly_totals.agg({'New_cases': 'mean', 'New_deaths': 'mean'})
# The columns contain NaN values; they are left unfilled because imputing the
# mean would not preserve data fidelity.

In [190]:
# 7
# Mean 'Sales' per 'Category', and the categories with the highest and lowest mean.
cat_sal_grp = sup_ds.groupby('Category')['Sales'].mean()
# The value computed above is average sales, so the message is labelled as such
# (it previously said "profit margin", which this cell does not calculate).
print(f'Highest average sales are from {cat_sal_grp.idxmax()}. \nLowest average sales are from {cat_sal_grp.idxmin()}')

Highest average profit margin are from Technology. 
Lowest average profit margin are from Office Supplies


In [221]:
# 8
# Compare the average 'Score' of the 2018 and 2019 datasets.
hap2018_ds = pd.read_csv('/content/drive/MyDrive/Datasets/pandas_data_analysis/happy/2018.csv')

# Side-by-side frame of both years' scores, aligned on the 2018 row index.
hap_18_19_ds = pd.DataFrame(hap2018_ds['Score'])
hap_18_19_ds['2019'] = hap_ds['Score']
hap_18_19_ds.rename(columns = {'Score': '2018'}, inplace=True)

# Average each year from its own complete 'Score' column. Reading the 2019
# values back out of the aligned frame would silently drop any 2019 row whose
# index does not exist in the 2018 data.
mean_18 = hap2018_ds['Score'].mean()
mean_19 = hap_ds['Score'].mean()

print(f'Average score in 2018: {mean_18}')
print(f'Average score in 2019: {mean_19}')

Average score in 2018: 5.375916666666667
Average score in 2019: 5.407096153846155


In [232]:
# 9
# Fit a linear regression of 'power_stamina' on the two movement columns and
# print the fitted coefficients (one per feature, in column order).
feature_cols = ['movement_sprint_speed', 'movement_agility']
x = fif_ds[feature_cols]
y = fif_ds['power_stamina']

model = LinearRegression().fit(x, y)
print('Regression coefficients:', model.coef_)

Regression coefficients: [0.47296342 0.25139398]


In [236]:
# 10
# Difference between the mean 'Score' of the ten highest-scoring rows and the
# mean 'Score' of the ten lowest-scoring rows.
head = hap_ds.nlargest(10, 'Score')
tail = hap_ds.nsmallest(10, 'Score')

top_mean = head['Score'].mean()
bottom_mean = tail['Score'].mean()
top_mean - bottom_mean

4.1518

In [281]:
# Add a 'Month' column: the order date (day/month/year) converted to a monthly period.
months = pd.to_datetime(sup_ds['Order Date'], format='%d/%m/%Y')
month_periods = months.dt.to_period('M')
sup_ds.insert(1, 'Month', month_periods)

def season_type(month):
  """Return the season name for a year-month value.

  Args:
    month: Any value whose ``str()`` has the form ``'YYYY-MM'`` (for example a
      monthly pandas Period). The text after the first '-' is read as the
      month number.

  Returns:
    'Winter' for months 12, 1 and 2; 'Spring' for 3-5; 'Summer' for 6-8;
    'Fall' for everything else.
  """
  temp = int(str(month).split('-')[1])
  # Winter wraps around the year end, so it cannot be expressed as one chained
  # range check: `12 <= temp <= 2` is never true and sent Dec/Jan/Feb to 'Fall'.
  if temp in (12, 1, 2):
    return 'Winter'
  elif (3 <= temp <= 5):
    return 'Spring'
  elif (6 <= temp <= 8):
    return 'Summer'
  else:
    return 'Fall'

# Label every row with the season returned by season_type for its 'Month'
# value, then store the labels as a new 'Season' column at position 1.
seasons = sup_ds['Month'].apply(season_type)

sup_ds.insert(loc=1, column='Season', value=seasons)

In [283]:
# 11
# Mean 'Sales' for each value of the 'Month' column.
sales_by_month = sup_ds.groupby('Month')['Sales']
sales_by_month.mean()

Month
2015-01    184.489701
2015-02     98.258522
2015-03    358.479201
2015-04    214.668115
2015-05    195.407463
2015-06    262.007142
2015-07    237.898190
2015-08    185.736551
2015-09    305.706093
2015-10    197.820082
2015-11    254.600198
2015-12    248.784885
2016-01    316.964168
2016-02    186.740797
2016-03    252.650925
2016-04    214.807978
2016-05    209.507206
2016-06    182.940884
2016-07    210.354846
2016-08    233.027482
2016-09    221.521425
2016-10    189.095960
2016-11    234.421805
2016-12    239.690036
2017-01    208.342596
2017-02    276.853193
2017-03    317.795398
2017-04    230.236708
2017-05    256.366100
2017-06    202.675949
2017-07    195.514199
2017-08    174.526859
2017-09    198.261865
2017-10    310.328297
2017-11    216.028677
2017-12    283.251837
2018-01    304.031287
2018-02    191.548052
2018-03    248.368830
2018-04    177.709551
2018-05    182.608259
2018-06    205.066926
2018-07    201.009435
2018-08    293.634804
2018-09    190.182976
2018

In [295]:
# 12
# Mean row-to-row percentage change of 'New_cases' for Brazil and for India.
is_brazil = cov_ds['Country'] == 'Brazil'
is_india = cov_ds['Country'] == 'India'

brazil = cov_ds.loc[is_brazil, 'New_cases'].pct_change()
india = cov_ds.loc[is_india, 'New_cases'].pct_change()

print('Brazil:', brazil.mean())
print('India:', india.mean())

Brazil: 0.20187233276010455
India: 0.15583467961716777


In [299]:
# 13
# Mean 'shooting' for each distinct 'player_positions' value, highest first.
shooting_by_position = fif_ds.groupby('player_positions')['shooting'].mean()
shooting_by_position.sort_values(ascending=False)

player_positions
CF, RW, ST         91.0
LM, CAM, ST, LW    86.0
CF, RW, CAM, ST    83.0
LW, CF, ST         81.0
LW, LM, ST, CAM    80.5
                   ... 
CB, RB, CM         27.0
RWB, CM, ST        27.0
LB, RB, LM         26.5
RB, RM, LM, CB     26.0
GK                  NaN
Name: shooting, Length: 850, dtype: float64

In [301]:
# 14
# Correlation matrix of the 'Score' and 'GDP per capita' columns.
score_gdp_cols = ['Score', 'GDP per capita']
hap_ds[score_gdp_cols].corr()

Unnamed: 0,Score,GDP per capita
Score,1.0,0.793883
GDP per capita,0.793883,1.0


In [None]:
# 15
# Fit an ARIMA(1, 1, 1) model to the 'Sales' column and forecast three steps ahead.
# NOTE(review): the series is fed in row order, not aggregated by date -- confirm
# that this is the intended time axis.
model = ARIMA(sup_ds['Sales'], order=(1, 1, 1))
model_fit = model.fit()
# forecast() from statsmodels.tsa.arima.model returns the predicted values
# directly (labelled with the positions that follow the input), not a tuple.
# Indexing the result with [0] is therefore a label lookup that fails, so the
# full three-step forecast is kept as is.
forecast = model_fit.forecast(steps=3)
forecast