In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of '<directory>:<percent-encoded download URL>' entries.
# NOTE(review): the URL carries a signed, time-limited query string
# (X-Goog-Date / X-Goog-Expires), so it presumably stops working once it has
# expired -- confirm before relying on this cell.
DATA_SOURCE_MAPPING = 'digit-recognizer:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F3004%2F861823%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240603%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240603T191204Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D6650c4d507280764dd5c4a05c363bbbc03f2f4036539704b7e7d7446d30caa7ff5cec927258c3d1851f93687f51e752e64b1e857107897eb9b4bc51dd4628b721e40ab8980954f204388a03fce76fe15fa9eaa044de1476519a5def0acfe605db004098a25340819ac10c1c6195dafcad52d76fa3b798ebe68101ecc231e7d0ba719b1c9455e7edf4079d2dbcbdf2c3d162cd494d0bd8c03d75eacf8c7fa9b961828d75333526407b6bbf6ac2a4a76ee43f7b54954f8e4b4c48be65f775fb2ec929805a6279389e173afcc3485d2ebbd693464dffb419fc2c14366b21fbd28eba0e9d175c2b76dbc9caddcced079a35c17146475a1acaa7e97699ff30e07e6d7'

# Directories that receive the downloaded input data and the working output.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced anywhere in the visible cells.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every '<directory>:<encoded url>' entry of DATA_SOURCE_MAPPING into a
# temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
# A failed download is reported and the loop moves on to the next entry.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a literal colon inside the URL part cannot
    # break the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # The header can be missing (None) or zero; only draw the progress
            # bar when a positive size is known.
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_bytes > 0:
                    # Clamp so a body longer than announced cannot overflow the bar.
                    done = min(50, int(50 * dl / total_bytes))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch re-opens the file by name, so buffered bytes must be
            # on disk first; rewind for readers that use the open handle.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name so the imported `tarfile` module is not shadowed.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/digit-recognizer/sample_submission.csv
/kaggle/input/digit-recognizer/train.csv
/kaggle/input/digit-recognizer/test.csv


In [None]:
# Load the training CSV. The first column is used as the target and every
# remaining column as a feature.
df = pd.read_csv("/kaggle/input/digit-recognizer/train.csv")
y = df.iloc[:, 0]
x = df.iloc[:, 1:]


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=12,
)
for split_part in (xtrain, ytrain):
    print(split_part.shape)

(33600, 784)
(33600,)


In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier created with its default hyperparameters
# (no arguments are passed).
knn = KNeighborsClassifier()

In [None]:
# Fit the classifier on the training split.
knn.fit(X=xtrain, y=ytrain)

In [None]:
# Predict labels for the held-out split.
ypred = knn.predict(X=xtest)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Share of held-out rows predicted correctly. Accuracy is symmetric in its two
# arguments, so naming them in their conventional roles does not change the value.
accuracy_score(y_true=ytest, y_pred=ypred)

0.9654761904761905

In [None]:
# Next step: PCA.
# 1) Standardise the features first. The scaler learns its statistics from the
#    training split only and re-uses them for the test split. Calling
#    fit_transform on the test split would re-fit the scaler, so train and test
#    would be scaled with different statistics.
# NOTE: this cell overwrites xtrain / xtest, so run it once per kernel session.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.transform(xtest)

In [None]:
# 2) Import the PCA class from scikit-learn and configure the reducer.
from sklearn.decomposition import PCA
# n_components is the hyperparameter that sets how many components
# (i.e. output features) are kept.
pca = PCA(n_components=200)


In [None]:
# fit_transform on the training split only; the test split is projected with
# the already-fitted PCA via transform.
xtrain_trf = pca.fit_transform(xtrain)
xtest_trf = pca.transform(xtest)
# Show the scaled input next to its projection.
for matrix in (xtrain, xtrain_trf):
    print(matrix)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[ -6.27488562  -1.4666891   -4.37201045 ...   0.73984654   0.25056971
   -0.17068351]
 [  9.39620192 -10.87522393   2.26244107 ...   0.20086003  -0.59791258
   -0.90261773]
 [ -3.04279401  -8.6449548   -3.73884444 ...   0.05843031  -0.30412984
   -0.65297003]
 ...
 [ -1.10053884  10.16245139  -4.75531179 ...   0.34924377   0.04234743
   -0.72416057]
 [ 17.18332066   0.73284608  -2.30885401 ...  -0.49227667  -0.66040562
    0.36387692]
 [  4.35623933  -8.54453885  -2.92619886 ...   0.41583011   0.5999884
    1.28328976]]


In [None]:
# Re-train k-NN on the PCA-reduced features and score it on the held-out split.
knn = KNeighborsClassifier()
knn.fit(X=xtrain_trf, y=ytrain)
ypred = knn.predict(X=xtest_trf)
print(accuracy_score(y_true=ytest, y_pred=ypred))

0.9469047619047619


In [None]:
# Sweep the number of PCA components and record the k-NN accuracy for each one.
N_COMPONENTS_SWEEP = range(780, 785)  # candidate n_components values
TOP_N = 5                             # how many sorted entries to print

arr = []  # collects (accuracy, n_components) tuples
for n_components in N_COMPONENTS_SWEEP:
    pca = PCA(n_components=n_components)
    xtrain_trf = pca.fit_transform(xtrain)
    xtest_trf = pca.transform(xtest)
    knn = KNeighborsClassifier()
    knn.fit(xtrain_trf, ytrain)
    ypred = knn.predict(xtest_trf)
    arr.append((accuracy_score(ytest, ypred), n_components))

# NOTE(review): the sort is ascending, so the lowest accuracies come first --
# confirm that this is the intended order.
arr.sort(key=lambda e: e[0])

# Slicing (instead of indexing 0..4) cannot raise IndexError when the sweep
# produces fewer than TOP_N entries.
for entry in arr[:TOP_N]:
    print(entry)

(0.9371428571428572, 780)
(0.9371428571428572, 781)
(0.9371428571428572, 782)
(0.9371428571428572, 783)
(0.9371428571428572, 784)


In [None]:
# Show the full sorted list of (accuracy, n_components) tuples.
print(arr)

In [None]:
# Visualisation time: PCA is also used to view data in fewer dimensions.
# Keep two components so the data can be drawn as a 2-D scatter plot.
pca = PCA(n_components=2)
xtrain_trf = pca.fit_transform(xtrain)
xtest_trf = pca.transform(xtest)


In [None]:
# Put the two PCA coordinates and the matching label into one frame for inspection.
print(xtrain_trf.shape)
df = pd.DataFrame(xtrain_trf, columns=['f1', 'f2'])
# ytrain keeps the shuffled row labels of the original frame, while df has a
# fresh 0..n-1 index. Assigning the Series directly aligns on index and yields
# NaN / mismatched labels, so attach the values positionally instead.
df['target'] = np.asarray(ytrain)
print(df.head())

(33600, 2)
         f1         f2  target
0 -6.274887  -1.467205     1.0
1  9.396179 -10.875211     0.0
2 -3.042775  -8.644564     1.0
3 -3.502984   1.874249     4.0
4 -6.744499   2.499243     0.0


In [None]:
import plotly.express as px
# Cast the labels to strings before using them as the colour key.
ytrain_trf = ytrain.astype(str)
# 2-D scatter of the first two PCA coordinates, coloured by label.
fig = px.scatter(
    x=xtrain_trf[:, 0],
    y=xtrain_trf[:, 1],
    color=ytrain_trf,
    color_discrete_sequence=px.colors.qualitative.G10,
)
fig.show()





In [None]:
# Same idea in 3-D: keep three components instead of two.
pca = PCA(n_components=3)
xtrain_trf = pca.fit_transform(xtrain)
xtest_trf = pca.transform(xtest)
print(xtrain_trf.shape)


(33600, 3)


In [None]:
# Cast the labels to strings before using them as the colour key.
ytrain_trf = ytrain.astype(str)
# 3-D scatter of the three PCA coordinates, coloured by label.
fig = px.scatter_3d(
    x=xtrain_trf[:, 0],
    y=xtrain_trf[:, 1],
    z=xtrain_trf[:, 2],
    color=ytrain_trf,
)
# Tighten the margins around the plot.
fig.update_layout(margin={'l': 20, 'r': 20, 't': 20, 'b': 20})
fig.show()





In [None]:
# Variance explained by each kept component (the eigenvalues), one per component.
pca.explained_variance_


array([40.54371445, 29.10917124, 26.90408132])

In [None]:
print(pca.components_.shape)
# components_ holds the top 3 principal components.
# The data has 784 features, so the shape is 3 x 784.
# Projection: (33600 x 784) x (784 x 3) = (33600 x 3)

(3, 784)
