## Importing Pandas and NumPy

In [65]:
import pandas as pd
import numpy as np

## Reading CSV File & Showing First Five Rows

In [66]:
# Load the dataset from the working directory, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer='agora.csv')
df.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


### Shape of the data

In [67]:
df.shape

(50, 5)

### Counting the Null Values in Every Column

In [68]:
df.isnull().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

# Handling NaN Values

### Using Measure of Central Tendency

In [69]:
# Average of the Transport column (pandas skips NaN entries by default).
mean = df['Transport'].mean()
mean

215331.7324489796

In [70]:
# Fill the missing Transport entry with the column mean held in `mean`.
df['Transport'] = df['Transport'].fillna(value=mean)

In [71]:
df.isnull().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

### Using bfill Method

In [72]:
df = pd.read_csv('agora.csv')
df.isnull().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

In [73]:
# Backward fill: a missing Transport value takes the next valid value below it.
df['Transport'] = df['Transport'].bfill(axis=0)
# Confirm that no nulls remain in any column.
df.isna().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

### Using ffill Method

In [74]:
df = pd.read_csv('agora.csv')
df.isnull().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

In [75]:
# Forward fill: a missing Transport value takes the last valid value above it.
# This section demonstrates ffill, so ffill (not bfill) is the method to call;
# calling bfill here would merely repeat the previous section.
df.Transport = df.Transport.ffill(axis = 'rows')
# Confirm that no nulls remain in any column.
df.isnull().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

### Using replace Method

In [76]:
df = pd.read_csv('agora.csv')
df.isnull().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

In [77]:
# Replace the NaN in Transport with the column mean using Series.replace.
# The mean is computed first (NaN is skipped) and then used as the replacement
# value; substituting a literal 0 would leave `mean` unused and would drag the
# column's distribution toward zero.
mean = df.Transport.mean()
df.Transport = df.Transport.replace(np.nan, mean)
# Confirm that no nulls remain in any column.
df.isnull().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

# Manual Encoding (Without an Encoder Class)

In [78]:
df.Area.head()

0      Dhaka
1        Ctg
2    Rangpur
3      Dhaka
4    Rangpur
Name: Area, dtype: object

In [79]:
df.Area.unique()

array(['Dhaka', 'Ctg', 'Rangpur'], dtype=object)

In [80]:
# Swap each area name for a hand-picked integer code, matched position by position.
df['Area'] = df['Area'].replace(
    to_replace=['Dhaka', 'Ctg', 'Rangpur'],
    value=[0, 2, 4],
)

In [81]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,0,192261.83
1,162597.7,151377.59,443898.53,2,191792.06
2,153441.51,101145.55,407934.54,4,191050.39
3,144372.41,118671.85,383199.62,0,182901.99
4,142107.34,91391.77,366168.42,4,166187.94


# Label Encoder

In [82]:
df = pd.read_csv('agora.csv')
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [83]:
from sklearn.preprocessing import LabelEncoder

In [84]:
le = LabelEncoder()

In [85]:
# Fit the encoder on Area and overwrite the column with the integer labels
# (LabelEncoder numbers the classes in sorted order: Ctg, Dhaka, Rangpur).
df['Area'] = le.fit_transform(df.Area)

In [86]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


In [87]:
df = pd.read_csv('agora.csv')
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


### Encoding Every Column That Contains Strings

In [88]:
# Label-encode every non-numeric column and leave numeric columns untouched.
# Comparing a dtype with `np.number` is deprecated: it is False for integer
# dtypes, and on newer NumPy releases it may be False for float dtypes too,
# which would cause numeric columns to be label-encoded as well. pandas'
# dtype helper is the supported way to test for numeric data.
for column in df.columns:
    if pd.api.types.is_numeric_dtype(df[column]):
        continue
    df[column] = le.fit_transform(df[column])

  if df[column].dtype == np.number:


In [89]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


# One Hot Encoding

In [90]:
df = pd.read_csv('agora.csv')
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [91]:
# One-hot encode Area; drop_first omits the first category's column so the
# remaining dummy columns are not redundant with each other.
dummy_var = pd.get_dummies(data=df['Area'], drop_first=True)
dummy_var.head(n=5)

Unnamed: 0,Dhaka,Rangpur
0,1,0
1,0,0
2,0,1
3,1,0
4,0,1


In [92]:
# Copy of the frame without the original text column Area.
df2 = df.drop(columns='Area')

In [93]:
df2.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit
0,114523.61,136897.8,471784.1,192261.83
1,162597.7,151377.59,443898.53,191792.06
2,153441.51,101145.55,407934.54,191050.39
3,144372.41,118671.85,383199.62,182901.99
4,142107.34,91391.77,366168.42,166187.94


In [94]:
df = df2

In [95]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit
0,114523.61,136897.8,471784.1,192261.83
1,162597.7,151377.59,443898.53,191792.06
2,153441.51,101145.55,407934.54,191050.39
3,144372.41,118671.85,383199.62,182901.99
4,142107.34,91391.77,366168.42,166187.94


In [96]:
# Append the dummy columns to the right of the remaining columns.
df = pd.concat(objs=[df, dummy_var], axis='columns')

In [97]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit,Dhaka,Rangpur
0,114523.61,136897.8,471784.1,192261.83,1,0
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1
3,144372.41,118671.85,383199.62,182901.99,1,0
4,142107.34,91391.77,366168.42,166187.94,0,1


# Ordinal Encoding

In [98]:
df = pd.read_csv('agora.csv')
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [99]:
df.Area.unique()

array(['Dhaka', 'Ctg', 'Rangpur'], dtype=object)

In [100]:
# Category order handed to the ordinal encoder: position in this list becomes
# the code, i.e. Rangpur -> 0, Dhaka -> 1, Ctg -> 2.
city = [
    'Rangpur',
    'Dhaka',
    'Ctg',
]

In [101]:
from sklearn.preprocessing import OrdinalEncoder

In [102]:
ordi = OrdinalEncoder(categories=[city])

In [103]:
# Fit on the single-column Area frame (the encoder expects 2-D input) and
# return the encoded values as a 2-D float array.
ord_fit = ordi.fit_transform(X=df.loc[:, ['Area']])

In [104]:
ord_fit

array([[1.],
       [2.],
       [0.],
       [1.],
       [0.],
       [1.],
       [2.],
       [0.],
       [1.],
       [2.],
       [0.],
       [2.],
       [0.],
       [2.],
       [0.],
       [1.],
       [2.],
       [1.],
       [0.],
       [1.],
       [2.],
       [1.],
       [0.],
       [0.],
       [1.],
       [2.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [2.],
       [0.],
       [2.],
       [1.],
       [0.],
       [2.],
       [1.],
       [2.],
       [2.],
       [0.],
       [2.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.],
       [1.],
       [2.]])

# Hashing

In [105]:
df = pd.read_csv('agora.csv')
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [106]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.2.2-py2.py3-none-any.whl (80 kB)


ERROR: Exception:
Traceback (most recent call last):
  File "c:\users\anik chakraborty\appdata\local\programs\python\python39\lib\site-packages\pip\_vendor\urllib3\response.py", line 438, in _error_catcher
    yield
  File "c:\users\anik chakraborty\appdata\local\programs\python\python39\lib\site-packages\pip\_vendor\urllib3\response.py", line 519, in read
    data = self._fp.read(amt) if not fp_closed else b""
  File "c:\users\anik chakraborty\appdata\local\programs\python\python39\lib\site-packages\pip\_vendor\cachecontrol\filewrapper.py", line 62, in read
    data = self.__fp.read(amt)
  File "c:\users\anik chakraborty\appdata\local\programs\python\python39\lib\http\client.py", line 455, in read
    n = self.readinto(b)
  File "c:\users\anik chakraborty\appdata\local\programs\python\python39\lib\http\client.py", line 499, in readinto
    n = self.fp.readinto(b)
  File "c:\users\anik chakraborty\appdata\local\programs\python\python39\lib\socket.py", line 704, in readinto
    return s

Collecting statsmodels>=0.9.0
  Downloading statsmodels-0.12.2-cp39-none-win_amd64.whl (9.4 MB)


    from_path, content_type = download(link, temp_dir.path)
  File "c:\users\anik chakraborty\appdata\local\programs\python\python39\lib\site-packages\pip\_internal\network\download.py", line 145, in __call__
    for chunk in chunks:
  File "c:\users\anik chakraborty\appdata\local\programs\python\python39\lib\site-packages\pip\_internal\cli\progress_bars.py", line 144, in iter
    for x in it:
  File "c:\users\anik chakraborty\appdata\local\programs\python\python39\lib\site-packages\pip\_internal\network\utils.py", line 63, in response_chunks
    for chunk in response.raw.stream(
  File "c:\users\anik chakraborty\appdata\local\programs\python\python39\lib\site-packages\pip\_vendor\urllib3\response.py", line 576, in stream
    data = self.read(amt=amt, decode_content=decode_content)
  File "c:\users\anik chakraborty\appdata\local\programs\python\python39\lib\site-packages\pip\_vendor\urllib3\response.py", line 541, in read
    raise IncompleteRead(self._fp_bytes_read, self.length_remain

In [107]:
import category_encoders as ce

ModuleNotFoundError: No module named 'category_encoders'

In [None]:
enco = ce.HashingEncoder(cols='Area',n_components=3)

In [None]:
enc2 = enco.fit_transform(df['Area'])
enc2