In [9]:
import os
from pathlib import Path
import requests
import zipfile
import tarfile
import shutil
from warnings import filterwarnings
filterwarnings('ignore')
def download_to_cache(url, filename, force_download=False):
    def _get_custom_cache_dir():
        custom_cache_dir = Path.home() / ".cache"/ "skmini"/ "datasets"
        os.makedirs(custom_cache_dir, exist_ok=True)
        return custom_cache_dir

    file_path = _get_custom_cache_dir() / filename
    if os.path.isfile(file_path) or os.path.isdir(file_path):
        if not force_download:
            print('File Aldready exists. Use force_download=True to force download.')
    else:
        force_download=True

    if force_download:
        try:
            if os.path.isfile(file_path) or os.path.isdir(file_path):
                shutil.rmtree(file_path)  # Remove the directory and all its contents
                print(f"Successfully deleted the directory and its contents: {file_path}")
        except OSError as e:
            print(f"Error: {e.strerror}")

        response = requests.get(url, stream=True, verify=False)
        print(response)
        with open(file_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    file.write(chunk)
        print(f"Downloaded file to {file_path}")
        # now that it is downloaded, we might want to decompress the file
        if tarfile.is_tarfile(file_path):
            try:
                tar = tarfile.open(file_path)
                os.remove(file_path)
                tar.extractall(path = file_path)
                tar.close()
            except:
                print('Decompression Failed.')
        elif zipfile.is_zipfile(file_path): # change to endswith, which doesnt require us loading a library
            try:
                with zipfile.ZipFile(file_path, 'r') as zip_ref:
                    print(file_path)
                    os.remove(file_path)
                    os.mkdir(file_path)
                    zip_ref.extractall(file_path)
            except: print('Decompression Failed.')


In [10]:
download_to_cache("https://archive.ics.uci.edu/static/public/53/iris.zip", 'iris', force_download=True)

Successfully deleted the directory and its contents: /home/arjun/.cache/skmini/datasets/iris
<Response [200]>
Downloaded file to /home/arjun/.cache/skmini/datasets/iris
/home/arjun/.cache/skmini/datasets/iris


some issue between upper and lower download links, but the last one works

In [11]:
download_to_cache("https://github.com/datasciencedojo/datasets/raw/master/titanic.csv", 'titanic.csv')

File Aldready exists. Use force_download=True to force download.


In [12]:
from sklearn.datasets import load_iris
k = load_iris()
k

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [17]:
dataset = '''
6.2,2.8,4.8,1.8,Iris-virginica
6.1,3.0,4.9,1.8,Iris-virginica
6.4,2.8,5.6,2.1,Iris-virginica
7.2,3.0,5.8,1.6,Iris-virginica
7.4,2.8,6.1,1.9,Iris-virginica
7.9,3.8,6.4,2.0,Iris-virginica
'''

dataset.splitlines()

['',
 '6.2,2.8,4.8,1.8,Iris-virginica',
 '6.1,3.0,4.9,1.8,Iris-virginica',
 '6.4,2.8,5.6,2.1,Iris-virginica',
 '7.2,3.0,5.8,1.6,Iris-virginica',
 '7.4,2.8,6.1,1.9,Iris-virginica',
 '7.9,3.8,6.4,2.0,Iris-virginica']

In [21]:
from sklearn.datasets import load_diabetes
d = load_diabetes()
print(d['target'])

[151.  75. 141. 206. 135.  97. 138.  63. 110. 310. 101.  69. 179. 185.
 118. 171. 166. 144.  97. 168.  68.  49.  68. 245. 184. 202. 137.  85.
 131. 283. 129.  59. 341.  87.  65. 102. 265. 276. 252.  90. 100.  55.
  61.  92. 259.  53. 190. 142.  75. 142. 155. 225.  59. 104. 182. 128.
  52.  37. 170. 170.  61. 144.  52. 128.  71. 163. 150.  97. 160. 178.
  48. 270. 202. 111.  85.  42. 170. 200. 252. 113. 143.  51.  52. 210.
  65. 141.  55. 134.  42. 111.  98. 164.  48.  96.  90. 162. 150. 279.
  92.  83. 128. 102. 302. 198.  95.  53. 134. 144. 232.  81. 104.  59.
 246. 297. 258. 229. 275. 281. 179. 200. 200. 173. 180.  84. 121. 161.
  99. 109. 115. 268. 274. 158. 107.  83. 103. 272.  85. 280. 336. 281.
 118. 317. 235.  60. 174. 259. 178. 128.  96. 126. 288.  88. 292.  71.
 197. 186.  25.  84.  96. 195.  53. 217. 172. 131. 214.  59.  70. 220.
 268. 152.  47.  74. 295. 101. 151. 127. 237. 225.  81. 151. 107.  64.
 138. 185. 265. 101. 137. 143. 141.  79. 292. 178.  91. 116.  86. 122.
  72. 

In [24]:
from sklearn.datasets import load_diabetes

data = load_diabetes()
X, y = data.data, data.target  # X is for features, y is the target (disease progression)
y

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
       128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
       150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
       200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
        42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
        83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
       104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
       173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
       107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
        60., 174., 259., 178., 128.,  96., 126., 28