In [1]:
import pandas as pd
import plotly.express as px
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Task ~

In [2]:
bitcoin_df = pd.read_csv('bitcoin_trade_data.csv', parse_dates=['Date'], date_format='%m/%d/%Y')
bitcoin_df.head(10)

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,2024-02-24,50982.9,50739.6,50997.6,50592.0,40.10K,0.48%
1,2024-02-23,50740.5,51320.6,51532.5,50537.6,43.27K,-1.13%
2,2024-02-22,51320.4,51850.2,52015.8,50947.3,50.27K,-1.04%
3,2024-02-21,51858.2,52263.5,52367.3,50676.9,59.02K,-0.78%
4,2024-02-20,52263.5,51783.1,52936.8,50801.8,68.10K,0.93%
5,2024-02-19,51783.6,52119.6,52484.8,51694.2,36.73K,-0.64%
6,2024-02-18,52117.5,51646.0,52350.3,51199.6,26.89K,0.91%
7,2024-02-17,51646.0,52134.2,52175.5,50652.3,32.45K,-0.94%
8,2024-02-16,52134.2,51901.4,52556.7,51612.6,52.86K,0.45%
9,2024-02-15,51901.3,51805.2,52819.4,51327.5,74.72K,0.23%


In [3]:
# Stripping K from Vol. and % from Change columns
bitcoin_df['Vol.'] = (bitcoin_df['Vol.'].str.replace('K', 'e3').str.replace('M', 'e6')
                      .str.replace('B', 'e9'))
bitcoin_df['Vol.'] = bitcoin_df['Vol.'].astype('float32')
bitcoin_df['Change %'] = bitcoin_df['Change %'].str.replace('%', '').astype('float32')
bitcoin_df['Vol.']

0       40100.0
1       43270.0
2       50270.0
3       59020.0
4       68100.0
         ...   
4965     2160.0
4966      580.0
4967      260.0
4968      570.0
4969       80.0
Name: Vol., Length: 4970, dtype: float32

In [4]:
for column in ('Price', 'Open', 'High', 'Low'):
    bitcoin_df[column] = bitcoin_df[column].str.replace(',', '').astype('float32')

In [5]:
bitcoin_df.head(10)

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,2024-02-24,50982.898438,50739.601562,50997.601562,50592.0,40100.0,0.48
1,2024-02-23,50740.5,51320.601562,51532.5,50537.601562,43270.0,-1.13
2,2024-02-22,51320.398438,51850.199219,52015.800781,50947.300781,50270.0,-1.04
3,2024-02-21,51858.199219,52263.5,52367.300781,50676.898438,59020.0,-0.78
4,2024-02-20,52263.5,51783.101562,52936.800781,50801.800781,68100.0,0.93
5,2024-02-19,51783.601562,52119.601562,52484.800781,51694.199219,36730.0,-0.64
6,2024-02-18,52117.5,51646.0,52350.300781,51199.601562,26890.0,0.91
7,2024-02-17,51646.0,52134.199219,52175.5,50652.300781,32450.0,-0.94
8,2024-02-16,52134.199219,51901.398438,52556.699219,51612.601562,52860.0,0.45
9,2024-02-15,51901.300781,51805.199219,52819.398438,51327.5,74720.0,0.23


In [6]:
bitcoin_df.isna().sum()

Date        0
Price       0
Open        0
High        0
Low         0
Vol.        6
Change %    0
dtype: int64

In [7]:
bitcoin_df['Price'].dtype

dtype('float32')

## Task 2

In [8]:
# Box plot fro change
fig = px.box(bitcoin_df, y='Change %', points='all')
fig.show()

In [9]:
# Box plot
fig = px.box(bitcoin_df, y='Vol.', points='all')
fig.show()

In [10]:
bitcoin_df[['Vol.', 'Change %']].isna().sum()

Vol.        6
Change %    0
dtype: int64

In [11]:
bitcoin_df[(bitcoin_df['Vol.'].isna())]

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
4627,2011-06-25,17.5,17.5,17.5,17.5,,0.0
4628,2011-06-24,17.5,17.5,17.5,17.5,,0.0
4629,2011-06-23,17.5,17.5,17.5,17.5,,0.0
4630,2011-06-22,17.5,17.5,17.5,17.5,,0.0
4631,2011-06-21,17.5,17.5,17.5,17.5,,0.0
4632,2011-06-20,17.5,17.5,17.5,17.5,,0.0


In [12]:
bitcoin_df.bfill(inplace=True)

In [13]:
from sklearn.ensemble import IsolationForest

In [14]:
# Isolation Forest anomaly detection for Vol. column
model = IsolationForest()
model.fit(bitcoin_df[['Vol.']])
bitcoin_df['vol_anom_score'] = model.decision_function(bitcoin_df[['Vol.']])
bitcoin_df['vol_anomaly'] = model.predict(bitcoin_df[['Vol.']])

In [15]:
# Isolation Forest anomaly detection for Chnage % column
model = IsolationForest()
model.fit(bitcoin_df[['Change %']])
bitcoin_df['change_anom_score'] = model.decision_function(bitcoin_df[['Change %']])
bitcoin_df['change_anomaly'] = model.predict(bitcoin_df[['Change %']])

In [16]:
# Rows with anomaly in Vol. columns
bitcoin_df[bitcoin_df['vol_anomaly'] == -1]

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %,vol_anom_score,vol_anomaly,change_anom_score,change_anomaly
657,2022-05-08,34060.000000,35468.000000,35497.000000,33727.000000,6.710500e+08,-3.97,-0.346673,-1,0.009686,1
658,2022-05-07,35468.000000,36003.000000,36119.000000,34773.000000,2.888600e+08,-1.50,-0.310245,-1,0.094621,1
659,2022-05-06,36009.000000,36540.000000,36646.000000,35267.000000,7.528400e+08,-1.46,-0.350693,-1,0.095775,1
660,2022-05-05,36544.000000,39686.000000,39833.000000,36183.000000,1.040000e+09,-7.92,-0.365790,-1,-0.083134,-1
661,2022-05-04,39688.000000,37717.000000,40021.000000,37660.000000,6.914900e+08,5.22,-0.348968,-1,0.001777,1
...,...,...,...,...,...,...,...,...,...,...,...
2017,2018-08-17,6581.700195,6306.700195,6581.700195,6291.799805,3.200000e+06,4.36,-0.166114,-1,0.030964,1
2018,2018-08-16,6306.700195,6256.899902,6470.899902,6249.799805,3.500000e+06,0.80,-0.172791,-1,0.123112,1
2019,2018-08-15,6256.899902,6190.200195,6592.799805,6177.899902,3.740000e+06,1.08,-0.175985,-1,0.120488,1
2020,2018-08-14,6190.200195,6255.299805,6255.299805,5898.299805,3.660000e+06,-1.04,-0.173702,-1,0.112759,1


In [17]:
# Rows with anomaly in Change % columns
bitcoin_df[bitcoin_df['change_anomaly'] == -1]

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %,vol_anom_score,vol_anomaly,change_anom_score,change_anomaly
33,2024-01-22,39556.398438,41581.699219,41684.898438,39468.398438,85100.0,-4.870000,0.150560,1,-0.004192,-1
43,2024-01-12,42835.898438,46348.101562,46503.199219,41857.898438,136920.0,-7.580000,0.132694,1,-0.070839,-1
47,2024-01-08,46962.199219,43934.199219,47196.699219,43251.000000,103090.0,6.910000,0.147931,1,-0.042813,-1
75,2023-12-11,41256.101562,43791.000000,43806.300781,40277.101562,105190.0,-5.790000,0.143150,1,-0.026671,-1
101,2023-11-15,37874.898438,35549.300781,37954.101562,35379.601562,75510.0,6.540000,0.154374,1,-0.035752,-1
...,...,...,...,...,...,...,...,...,...,...,...
4771,2011-02-01,0.700000,0.500000,0.900000,0.500000,31560.0,34.619999,0.162266,1,-0.300979,-1
4783,2011-01-20,0.400000,0.300000,0.400000,0.300000,19880.0,24.600000,0.144595,1,-0.267555,-1
4789,2011-01-14,0.400000,0.300000,0.400000,0.300000,27130.0,25.940001,0.159572,1,-0.275909,-1
4856,2010-11-08,0.200000,0.300000,0.400000,0.200000,118200.0,-28.530001,0.134745,1,-0.284885,-1


## Task 5

In [18]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [19]:
fig = make_subplots(rows=3, cols=2)
fig.add_trace(
    go.Histogram(x=bitcoin_df['Price'], name='Price histogram'),
    row=1, col=1
)
fig.add_trace(
    go.Histogram(x=bitcoin_df['Open'], name='Open histogram'),
    row=1, col=2
)
fig.add_trace(
    go.Histogram(x=bitcoin_df['High'], name='High histogram'),
    row=2, col=1
)
fig.add_trace(
    go.Histogram(x=bitcoin_df['Low'], name='Low histogram'),
    row=2, col=2
)
fig.add_trace(
    go.Histogram(x=bitcoin_df['Vol.'], name='Vol. histogram'),
    row=3, col=1
)
fig.add_trace(
    go.Histogram(x=bitcoin_df['Change %'], name='Change histogram'),
    row=3, col=2
)
fig.show()

In [20]:
bitcoin_df = bitcoin_df.sort_values(by='Date')

fig = make_subplots(rows=3, cols=2,
                    specs=[[{}, {}], [{}, {}], [{}, {}]])
fig.add_trace(
    go.Scatter(x=bitcoin_df['Date'], y=bitcoin_df['Price'], name='Price scatter'),
    row=1, col=1
)
fig.add_trace(
    go.Scatter(x=bitcoin_df['Date'], y=bitcoin_df['Open'], name='Open scatter'),
    row=1, col=2
)
fig.add_trace(
    go.Scatter(x=bitcoin_df['Date'], y=bitcoin_df['High'], name='High scatter'),
    row=2, col=1
)
fig.add_trace(
    go.Scatter(x=bitcoin_df['Date'], y=bitcoin_df['Low'], name='Low scatter'),
    row=2, col=2
)
fig.add_trace(
    go.Scatter(x=bitcoin_df['Date'], y=bitcoin_df['Vol.'], name='Vol. scatter'),
    row=3, col=1
)
fig.add_trace(
    go.Scatter(x=bitcoin_df['Date'], y=bitcoin_df['Change %'], name='Change scatter'),
    row=3, col=2
)
fig.show()

In [21]:
bitcoin_df['Change %'].describe()

count    4970.000000
mean        0.412042
std         7.107526
min       -57.209999
25%        -1.170000
50%         0.000000
75%         1.770000
max       336.839996
Name: Change %, dtype: float64

In [22]:
bitcoin_df.mode(numeric_only=True)

Unnamed: 0,Price,Open,High,Low,Vol.,Change %,vol_anom_score,vol_anomaly,change_anom_score,change_anomaly
0,0.1,0.1,0.1,0.1,30180.0,0.0,0.160897,1,0.150944,1


## Task 4

In [23]:
# Manual min-max normalization
copy_df = bitcoin_df.copy().drop(['vol_anom_score', 'vol_anomaly', 'change_anom_score', 'change_anomaly'], axis=1)
for column in copy_df.select_dtypes(include=np.number).columns.to_list():
    min = copy_df[column].min()
    max = copy_df[column].max()
    copy_df[column] = (copy_df[column] - min) / (max - copy_df[column])
copy_df.sample(20)

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
1916,2018-11-26,0.060243,0.063985,0.064478,0.058731,0.000128,0.150914
939,2021-07-30,1.666501,1.453127,1.58138,1.371094,2.2e-05,0.189262
247,2023-06-22,0.794168,0.799043,0.792296,0.805301,1.8e-05,0.168629
3838,2013-08-22,0.001808,0.001829,0.001806,0.00182,3e-06,0.166208
95,2023-11-21,1.129254,1.245268,1.199994,1.172372,1.6e-05,0.154827
2518,2017-04-03,0.017287,0.016519,0.017068,0.016794,2.2e-05,0.185933
409,2023-01-11,0.361843,0.348196,0.352734,0.35347,6.5e-05,0.179967
2425,2017-07-05,0.04049,0.040321,0.039829,0.040174,2e-05,0.171234
394,2023-01-26,0.517073,0.518403,0.508612,0.526137,7.2e-05,0.169253
4066,2013-01-06,0.000197,0.000198,0.000194,0.000202,3e-06,0.169843


In [24]:
# Manual z-score normalization
copy_df = bitcoin_df.copy().drop(['vol_anom_score', 'vol_anomaly', 'change_anom_score', 'change_anomaly'], axis=1)
for column in copy_df.select_dtypes(include=np.number).columns.to_list():
    mean = copy_df[column].mean()
    std = copy_df[column].std()
    copy_df[column] = (copy_df[column] - mean) / std
copy_df.sample(20)

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
3946,2013-05-06,-0.675169,-0.674701,-0.674407,-0.675348,-0.079076,-0.511014
4818,2010-12-16,-0.682453,-0.682236,-0.682322,-0.682423,-0.07995,-0.057973
2751,2016-08-13,-0.644454,-0.644044,-0.644855,-0.643466,-0.079918,-0.118472
2488,2017-05-03,-0.585858,-0.588157,-0.587284,-0.587211,-0.079516,0.327534
4360,2012-03-18,-0.682128,-0.681911,-0.682004,-0.682089,-0.079895,0.103828
2542,2017-03-10,-0.610129,-0.604725,-0.597829,-0.616325,-0.078816,-0.990787
2461,2017-05-30,-0.53988,-0.533994,-0.534375,-0.538866,-0.079264,-0.586989
4703,2011-04-10,-0.682427,-0.682204,-0.68229,-0.682389,-0.080008,-0.057973
1062,2021-03-29,3.064522,2.946671,3.026966,2.991987,-0.079348,0.409138
1008,2021-05-22,1.752929,1.744484,1.780849,1.67897,-0.078846,-0.001694


In [25]:
# MinMax normalization using Scikit
from sklearn.preprocessing import MinMaxScaler

date_col = bitcoin_df['Date'].sort_values(ignore_index=True, ascending=True)
copy_df = bitcoin_df.copy().drop(['vol_anom_score', 'vol_anomaly', 'change_anom_score', 'change_anomaly', 'Date'], axis=1)
scaler = MinMaxScaler()
data = scaler.fit_transform(copy_df)
copy_df = pd.DataFrame(data=data, columns=copy_df.columns.to_list())
copy_df['Date'] = date_col.values
copy_df.sample(20)

Unnamed: 0,Price,Open,High,Low,Vol.,Change %,Date
4872,0.553166,0.541527,0.543618,0.54863,5.829977e-06,0.150641,2023-11-19
1804,0.003596,0.003593,0.003528,0.00362,8.39821e-06,0.145591,2015-06-26
1506,0.007065,0.007135,0.007065,0.007112,1.501119e-06,0.142774,2014-09-01
1523,0.006282,0.006828,0.006699,0.006163,1.979866e-06,0.124908,2014-09-18
1688,0.004053,0.003819,0.003982,0.003861,1.812304e-05,0.160741,2015-03-02
1798,0.003627,0.003615,0.003573,0.003598,1.145414e-05,0.146073,2015-06-20
4926,0.634343,0.686347,0.674051,0.631009,3.061297e-05,0.125948,2024-01-12
154,1e-06,3e-06,1e-06,3e-06,4.295302e-07,0.145185,2010-12-19
1217,0.00684,0.006425,0.006911,0.006452,4.263982e-06,0.161578,2013-11-16
1029,0.001699,0.001712,0.001702,0.001694,4.590604e-06,0.143383,2013-05-12


In [26]:
# Robust normalization using Scikit
from sklearn.preprocessing import RobustScaler

date_col = bitcoin_df['Date'].sort_values(ignore_index=True, ascending=True)
copy_df = bitcoin_df.copy().drop(['vol_anom_score', 'vol_anomaly', 'change_anom_score', 'change_anomaly', 'Date'], axis=1)
scaler = RobustScaler()
data = scaler.fit_transform(copy_df)
copy_df = pd.DataFrame(data=data, columns=copy_df.columns.to_list())
copy_df['Date'] = date_col.values
copy_df.sample(20)

Unnamed: 0,Price,Open,High,Low,Vol.,Change %,Date
3864,3.470454,3.36661,3.392147,3.530088,0.245745,1.064626,2021-02-14
2256,-0.069319,-0.067954,-0.069657,-0.069254,-0.294636,-0.034014,2016-09-20
2284,-0.067396,-0.065918,-0.067683,-0.067215,-0.281817,-0.108844,2016-10-18
1635,-0.09338,-0.091447,-0.092128,-0.094587,-0.310289,-1.010204,2015-01-08
72,-0.114205,-0.112927,-0.112764,-0.1162,-0.394608,0.0,2010-09-28
653,-0.113844,-0.112573,-0.112419,-0.115828,-0.204003,0.343537,2012-05-01
567,-0.113793,-0.112499,-0.112355,-0.115789,0.334895,-1.044218,2012-02-05
3767,1.015291,1.029157,1.00296,1.032031,0.622832,-0.343537,2020-11-09
2578,0.138509,0.137999,0.133718,0.143951,0.248579,0.27551,2017-08-08
4110,4.458889,4.425379,4.303974,4.535589,0.07601,0.292517,2021-10-18


## Task 6


In [39]:
fig = px.imshow(bitcoin_df.drop(['Date'], axis=1).corr(), text_auto=True, aspect='auto')
fig.show()

In [43]:
fig = px.imshow(bitcoin_df.corr(), text_auto=True, aspect='auto')
fig.show()

## Task 3

In [28]:
cars_df = pd.read_csv('cars.csv')
cars_df.sample(10)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
7419,Hyundai,50000,Diesel,First Owner,350000
7805,Volkswagen,65000,Diesel,Second Owner,365000
5421,Hyundai,60000,Diesel,First Owner,957000
3521,Toyota,68089,Petrol,First Owner,2000000
3162,Maruti,32000,Diesel,First Owner,690000
2084,Mahindra,120000,Diesel,First Owner,450000
841,Maruti,70000,Petrol,Second Owner,275000
3479,Tata,90000,Petrol,Second Owner,90000
4213,Maruti,60000,Diesel,First Owner,562000
5407,Ford,40000,Diesel,First Owner,600000


In [29]:
# Cutting dataset for better encoding

cars_df = cars_df[(cars_df['fuel'].isin(['Diesel', 'Petrol'])) &
                  (cars_df['owner'].isin(['First Owner', 'Second Owner']))]

In [30]:
encoded_df = pd.get_dummies(cars_df, columns=['fuel', 'owner'], drop_first=True)
encoded_df

Unnamed: 0,brand,km_driven,selling_price,fuel_Petrol,owner_Second Owner
0,Maruti,145500,450000,False,False
1,Skoda,120000,370000,False,True
3,Hyundai,127000,225000,False,False
4,Maruti,120000,130000,True,False
5,Hyundai,45000,440000,True,False
...,...,...,...,...,...
8122,Hyundai,80000,475000,False,True
8123,Hyundai,110000,320000,True,False
8125,Maruti,120000,382000,False,False
8126,Tata,25000,290000,False,False


In [31]:
encoded_df.sample(20)

Unnamed: 0,brand,km_driven,selling_price,fuel_Petrol,owner_Second Owner
2680,Skoda,11000,645000,True,False
450,Hyundai,25000,800000,True,False
3570,Tata,90000,200000,False,False
1217,Tata,30000,80000,True,False
856,Tata,150000,150000,True,False
7402,Hyundai,83000,475000,True,False
6348,Hyundai,75000,35000,True,True
1087,Maruti,5621,650000,True,False
4230,BMW,8500,5500000,False,False
510,Maruti,15000,755000,False,False


In [38]:
bitcoin_df['Date'].describe

<bound method NDFrame.describe of 4969   2010-07-18
4968   2010-07-19
4967   2010-07-20
4966   2010-07-21
4965   2010-07-22
          ...    
4      2024-02-20
3      2024-02-21
2      2024-02-22
1      2024-02-23
0      2024-02-24
Name: Date, Length: 4970, dtype: datetime64[ns]>