# Aula 01 - Introdução ao Scikit-Learn
#### Material baseado no livro "Practical Machine Learning with Python" Sarkar, D. (et al.) (2018) 

---

# Numpy
## Arrays

In [7]:
import numpy as np
arr = np.array([1,3,4,5,6])
arr

array([1, 3, 4, 5, 6])

In [3]:
arr.shape

(5,)

In [4]:
arr.dtype

dtype('int32')

In [5]:
arr = np.array([1,'st','er',3])
arr.dtype

dtype('<U11')

In [8]:
np.sum(arr)

19

### Creating arrays

In [9]:
arr = np.array([[1,2,3],[2,4,6],[8,8,8]])
arr.shape

(3, 3)

In [10]:
arr

array([[1, 2, 3],
       [2, 4, 6],
       [8, 8, 8]])

In [11]:
arr = np.zeros((2,4))
arr

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [12]:
arr = np.ones((2,4))
arr

array([[1., 1., 1., 1.],
       [1., 1., 1., 1.]])

In [13]:
arr = np.identity(3)
arr

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [14]:
arr = np.random.randn(3,4)
arr

array([[-0.76371584, -1.92289079,  0.74695951, -0.98241008],
       [ 0.64055313,  1.2331538 ,  1.94513137,  0.07924897],
       [ 0.24178324,  0.75065216,  0.93192132, -2.49350462]])

### Accessing array elements
#### Simple indexing

In [15]:
arr[1]

array([0.64055313, 1.2331538 , 1.94513137, 0.07924897])

In [16]:
arr = np.arange(12).reshape(2,2,3)
arr

array([[[ 0,  1,  2],
        [ 3,  4,  5]],

       [[ 6,  7,  8],
        [ 9, 10, 11]]])

In [None]:
arr[0]

In [None]:
arr = np.arange(10)
arr[5:]


In [None]:
arr[5:8]

In [None]:
arr[:-5]

In [None]:
arr = np.arange(12).reshape(2,2,3)
arr

In [None]:
arr[1:2]

In [None]:
arr = np.arange(27).reshape(3,3,3)
arr

In [None]:
arr[:,:,2]

In [None]:
arr[...,2]

#### Advanced Indexing

In [17]:
arr = np.arange(9).reshape(3,3)
arr

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [None]:
arr[[0,1,2],[1,0,0]]

##### Boolean Indexing

In [None]:
cities = np.array(["delhi","banglaore","mumbai","chennai","bhopal"])
city_data = np.random.randn(5,3)
city_data

In [None]:
city_data[cities =="delhi"]

In [None]:
city_data[city_data >0]

In [None]:
city_data[city_data >0] = 0
city_data


#### Operations on arrays

In [None]:
arr = np.arange(15).reshape(3,5)
arr

In [None]:
arr + 5

In [None]:
arr * 2

In [None]:
arr1 = np.arange(15).reshape(5,3)
arr2 = np.arange(5).reshape(5,1)
arr2 + arr1


In [None]:
arr1

In [None]:
arr2

In [None]:
arr1 = np.random.randn(5,3)
arr1

In [None]:
np.modf(arr1)

#### Linear algebra using numpy

In [None]:
A = np.array([[1,2,3],[4,5,6],[7,8,9]])
B = np.array([[9,8,7],[6,5,4],[1,2,3]])
A.dot(B)

In [None]:
A = np.arange(15).reshape(3,5)
A.T

# Pandas
## Data frames

In [5]:
import pandas as pd
d =  [{'city':'Delhi',"data":1000},
      {'city':'Banglaore',"data":2000},
      {'city':'Mumbai',"data":1000}]
pd.DataFrame(d)

Unnamed: 0,city,data
0,Delhi,1000
1,Banglaore,2000
2,Mumbai,1000


In [None]:
df = pd.DataFrame(d)

### Reading in data

In [None]:
city_data = pd.read_csv(filepath_or_buffer='simplemaps-worldcities-basic.csv')

In [None]:
city_data.head(n=10)

In [None]:
city_data.tail()

In [None]:
series_es = city_data.lat

In [None]:
type(series_es)

In [None]:
series_es[1:10:2]

In [None]:
series_es[:7]

In [None]:
series_es[:-7315]

In [None]:
city_data[:7]

In [None]:
city_data.iloc[:5,:4]

In [None]:
city_data[city_data['pop'] > 10000000][city_data.columns[pd.Series(city_data.columns).str.startswith('l')]]

In [None]:
city_greater_10mil = city_data[city_data['pop'] > 10000000]
city_greater_10mil.rename(columns={'pop':'population'}, inplace=True)
city_greater_10mil.where(city_greater_10mil.population > 15000000)

In [None]:
df = pd.DataFrame(np.random.randn(8, 3),
columns=['A', 'B', 'C'])

### Operations on dataframes

In [None]:
nparray = df.values
type(nparray)

In [None]:
from numpy import nan
df.iloc[4,2] = nan

In [None]:
df

In [None]:
df.fillna(0)

In [None]:
columns_numeric = ['lat','lng','pop']

In [None]:
city_data[columns_numeric].mean()

In [None]:
city_data[columns_numeric].sum()

In [None]:
city_data[columns_numeric].count()

In [None]:
city_data[columns_numeric].median()

In [None]:
city_data[columns_numeric].quantile(0.8)

In [None]:
city_data[columns_numeric].sum(axis = 1).head()

In [None]:
city_data[columns_numeric].describe()

In [None]:
city_data1 = city_data.sample(3)

### Concatanating data frames

In [None]:
city_data2 = city_data.sample(3)
city_data_combine = pd.concat([city_data1,city_data2])
city_data_combine

In [None]:
country_data = city_data[['iso3','country']].drop_duplicates()

In [None]:
country_data.shape

In [None]:
country_data.head()

In [None]:
del(city_data['country'])

In [None]:
city_data.merge(country_data, 'inner').head()

# Scikit-learn

In [16]:
from sklearn import datasets
diabetes = datasets.load_diabetes()
X = diabetes.data[:10]
y = diabetes.target

In [17]:
X[:5]

array([[ 0.03807591,  0.05068012,  0.06169621,  0.02187235, -0.0442235 ,
        -0.03482076, -0.04340085, -0.00259226,  0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, -0.02632783, -0.00844872,
        -0.01916334,  0.07441156, -0.03949338, -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, -0.00567061, -0.04559945,
        -0.03419447, -0.03235593, -0.00259226,  0.00286377, -0.02593034],
       [-0.08906294, -0.04464164, -0.01159501, -0.03665645,  0.01219057,
         0.02499059, -0.03603757,  0.03430886,  0.02269202, -0.00936191],
       [ 0.00538306, -0.04464164, -0.03638469,  0.02187235,  0.00393485,
         0.01559614,  0.00814208, -0.00259226, -0.03199144, -0.04664087]])

In [18]:
y[:10]

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310.])

In [20]:
feature_names=['age', 'sex', 'bmi', 'bp',
               's1', 's2', 's3', 's4', 's5', 's6']

## Scikit example regression

In [9]:
import numpy as np
from sklearn import datasets
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MaxAbsScaler

from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV

p = MaxAbsScaler()


diabetes = datasets.load_diabetes()
X_train = diabetes.data[:310]
y_train = diabetes.target[:310]

X_test = diabetes.data[310:]
y_test = diabetes.target[310:]

learner = Lasso(random_state=0)
alphas = np.logspace(-4, -0.5, 30)

p.fit(X_train)
x = p.transform(X_train)

p.fit(y_train)
y = p.transform(y_train)

learner.fit(x, y)

ValueError: Expected 2D array, got 1D array instead:
array=[151.  75. 141. 206. 135.  97. 138.  63. 110. 310. 101.  69. 179. 185.
 118. 171. 166. 144.  97. 168.  68.  49.  68. 245. 184. 202. 137.  85.
 131. 283. 129.  59. 341.  87.  65. 102. 265. 276. 252.  90. 100.  55.
  61.  92. 259.  53. 190. 142.  75. 142. 155. 225.  59. 104. 182. 128.
  52.  37. 170. 170.  61. 144.  52. 128.  71. 163. 150.  97. 160. 178.
  48. 270. 202. 111.  85.  42. 170. 200. 252. 113. 143.  51.  52. 210.
  65. 141.  55. 134.  42. 111.  98. 164.  48.  96.  90. 162. 150. 279.
  92.  83. 128. 102. 302. 198.  95.  53. 134. 144. 232.  81. 104.  59.
 246. 297. 258. 229. 275. 281. 179. 200. 200. 173. 180.  84. 121. 161.
  99. 109. 115. 268. 274. 158. 107.  83. 103. 272.  85. 280. 336. 281.
 118. 317. 235.  60. 174. 259. 178. 128.  96. 126. 288.  88. 292.  71.
 197. 186.  25.  84.  96. 195.  53. 217. 172. 131. 214.  59.  70. 220.
 268. 152.  47.  74. 295. 101. 151. 127. 237. 225.  81. 151. 107.  64.
 138. 185. 265. 101. 137. 143. 141.  79. 292. 178.  91. 116.  86. 122.
  72. 129. 142.  90. 158.  39. 196. 222. 277.  99. 196. 202. 155.  77.
 191.  70.  73.  49.  65. 263. 248. 296. 214. 185.  78.  93. 252. 150.
  77. 208.  77. 108. 160.  53. 220. 154. 259.  90. 246. 124.  67.  72.
 257. 262. 275. 177.  71.  47. 187. 125.  78.  51. 258. 215. 303. 243.
  91. 150. 310. 153. 346.  63.  89.  50.  39. 103. 308. 116. 145.  74.
  45. 115. 264.  87. 202. 127. 182. 241.  66.  94. 283.  64. 102. 200.
 265.  94. 230. 181. 156. 233.  60. 219.  80.  68. 332. 248.  84. 200.
  55.  85.  89.  31. 129.  83. 275.  65. 198. 236. 253. 124.  44. 172.
 114. 142.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [10]:
y_pred = learner.predict(X_test)
y_pred

NotFittedError: This Lasso instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [15]:
np.mean(abs(y_pred - y_test))

51.37617602903816

In [9]:
x_train

NameError: name 'x_train' is not defined