In [None]:
# Celsius readings to convert (a few values appear more than once).
ctemps = [5, 10, 12, 14, 10, 23, 41, 30, 12, 24, 12, 18, 29]

# The same expression inside [] builds a list (order and duplicates kept),
# while inside {} it builds a set (duplicates collapsed, no guaranteed order).
ftemps1 = [(celsius * 9/5) + 32 for celsius in ctemps]
ftemps2 = {(celsius * 9/5) + 32 for celsius in ctemps}
print("\n", ftemps1, type(ftemps1))
print("\n", ftemps2, type(ftemps2))


# The even numbers from 2 through 20.
evens = list(range(2, 21, 2))

# Square every even number, once as a list and once as a set.
list_evenSquared = [even * even for even in evens]
print("\nList comprehension", list_evenSquared)

set_evenSquared = {even * even for even in evens}
print("\nSet comprehension", set_evenSquared)

In [None]:
import numpy as np

# Two 2x2 integer matrices: a holds 1..4 and b holds 5..8, filled row by row.
a = np.arange(1, 5).reshape(2, 2)
b = np.arange(5, 9).reshape(2, 2)

# vertical stacking: b's rows are appended below a's rows
print("Vertical stacking:", np.vstack([a, b]))

# horizontal stacking: b's columns are appended to the right of a's columns
print("Horizontal stacking:", np.hstack([a, b]))


# stacking columns: the 1-D sequence c becomes one extra column of a
c = [5, 6]
print("Column stacking:", np.column_stack([a, c]))


In [None]:
import pandas as pd
import numpy as np
# A named Series holding the squares of 0..5; with no explicit index,
# pandas supplies the default 0..5 integer index.
s = pd.Series([n ** 2 for n in range(6)], name='squares')
print(s)
print(s.index)
print(s.values, s.index)
# Positional slice: elements at positions 2 and 3 (the end is exclusive).
print(s[2:4])

In [None]:
# 2014 scores: values and labels are passed as two parallel lists.
pop2014 = pd.Series(
    data=[100, 99.3, 95.5, 93.5, 92.4, 84.8, 84.5, 78.9, 74.3, 72.8],
    index=['Java', 'C', 'C++', 'Python', 'C#', 'PHP', 'JavaScript', 'Ruby', 'R', 'Matlab'],
)

# 2015 scores: built from a dict, so the keys become the index labels.
pop2015 = pd.Series({
    'Java': 100,
    'C': 99.9,
    'C++': 99.4,
    'Python': 96.5,
    'C#': 91.3,
    'R': 84.8,
    'PHP': 84.5,
    'JavaScript': 83.0,
    'Ruby': 76.2,
    'Matlab': 72.4,
})

print(pop2014)

# Older pandas releases sorted a dict's keys when building the index;
# current releases keep the dict's insertion order.
print(pop2015)

In [None]:
print(pop2014.index)
# Position-based slice: the first two entries (the end position is excluded).
print(pop2014.iloc[:2])

# Label-based slice: everything up to and including the 'Ruby' label.
print(pop2014.loc[:'Ruby'])

loc gets rows (or columns) with particular labels from the index.

iloc gets rows (or columns) at particular positions in the index (so it only takes integers).

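`ix` usually tried to behave like `loc` but fell back to behaving like `iloc` if a label was not present in the index. It was deprecated in pandas 0.20 and removed in pandas 1.0, so use `loc` or `iloc` explicitly instead.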

In [None]:
# Combine the two Series into one frame; each dict key becomes a column name
# and the rows are aligned on the Series' index labels.
twoyears = pd.DataFrame(data={
    '2014': pop2014,
    '2015': pop2015,
})
print(twoyears)

In [None]:
# Add a derived column: the element-wise mean of the two yearly scores.
twoyears['Average'] = (twoyears['2014'] + twoyears['2015']) / 2
print(twoyears)

In [None]:
# A 3x3 frame of letters drawn at random (with replacement) from a, b, c, e.
# No seed is set, so the contents change on every run.
test_data = pd.DataFrame(
    np.random.choice(['a', 'b', 'c', 'e'], size=(3, 3)),
    index=[1, 2, 3],
    columns=['AA', 'BB', 'CC'],
)
print(test_data)

# pandas aggregation

In [None]:
# Peek at the first 10 raw lines of tips.csv before parsing it with pandas.
# The context manager closes the file handle as soon as the lines are read;
# a bare open(...).readlines() leaves it open until garbage collection.
with open('tips.csv', 'r') as tips_file:
    raw_head = tips_file.readlines()[:10]
raw_head

In [None]:
# Parse tips.csv (looked up in the current working directory) into a DataFrame
# using read_csv's default settings.
tips = pd.read_csv('tips.csv')

In [None]:
# Display the first rows of the table (head() defaults to 5 rows).
tips.head()

In [None]:
# Column-wise mean of the numeric columns.
# numeric_only=True is required on pandas >= 2.0, where mean() raises a
# TypeError if the frame also holds non-numeric columns (tips presumably has
# text columns such as gender/smoker/day/time, which are used as group keys
# later in this notebook).
tips.mean(numeric_only=True)

In [None]:
# Show the dtype pandas inferred for each column.
tips.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); by default
# describe() reports on the numeric columns only.
tips.describe()

In [None]:
tips.shape # (row count, column count)

In [None]:
# Mean of every numeric column within each gender group.
# numeric_only=True keeps this working on pandas >= 2.0, where the group-wise
# mean no longer silently drops non-numeric columns and raises instead.
tips.groupby('gender').mean(numeric_only=True)

In [None]:
# Mean of every numeric column for each (gender, smoker) combination; the
# result is indexed by both grouping keys.
# numeric_only=True keeps this working on pandas >= 2.0, where the group-wise
# mean no longer silently drops non-numeric columns and raises instead.
tips.groupby(['gender', 'smoker']).mean(numeric_only=True)

In [None]:
# Pivot total_bill with gender as rows and smoker as columns; each cell is
# aggregated with pivot_table's default aggfunc.
pd.pivot_table(tips, values='total_bill', index='gender', columns='smoker')

# pandas.pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', 
#         fill_value=None, margins=False, dropna=True, margins_name='All')

# Create a spreadsheet-style pivot table as a DataFrame.

In [None]:
# Two-level pivot of total_bill: rows are keyed by (gender, smoker) and
# columns by (day, time).
# Signature reminder:
# pandas.pivot_table(data, values=None, index=None, columns=None, aggfunc='mean')
pd.pivot_table(
    tips,
    values='total_bill',
    index=['gender', 'smoker'],
    columns=['day', 'time'],
)


# Data Frame Creation and visualization 

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
# Read the iris data straight from the UCI repository. header=None tells
# read_csv that the file has no header row, so columns are numbered 0..n-1.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
df = pd.read_csv(url, header=None)

df.head()

In [None]:
# Give the five positional columns descriptive names.
df.columns = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'flower_type',
]

# Convert the label column to a categorical dtype, then relabel its
# categories as the integers 0, 1, 2.
df['flower_type'] = (
    df['flower_type']
    .astype('category')
    .cat.rename_categories([0, 1, 2])
)

df.head()

In [None]:
# Summarise the flower_type column on its own.
df['flower_type'].describe()

Example of `describe()` on a non-numeric Series:

```python
s = pd.Series(['a', 'a', 'b', 'c'])

s.describe()
```

```
count     4
unique    3
top       a
freq      2
dtype: object
```

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.describe.html


In [None]:
# Draw the frame's histograms, then render the figure.
df.hist()
plt.show()

In [None]:
# Pairwise scatter plots of the columns, with a kernel density estimate on the
# diagonal instead of a histogram.
# scatter_matrix lives in pd.plotting; the top-level pd.scatter_matrix alias
# was deprecated and later removed from pandas.
pd.plotting.scatter_matrix(df, diagonal='kde')
plt.show()

Values accepted by the `kind` argument of `DataFrame.plot`:

- `bar` or `barh` for bar plots
- `hist` for histogram
- `box` for boxplot
- `kde` or `density` for density plots
- `area` for area plots
- `scatter` for scatter plots
- `hexbin` for hexagonal bin plots
- `pie` for pie plots

https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html#visualization-kde



More general:
http://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html


# More Operations on the Data Frame

In [None]:
# Reorder the rows by ascending sepal_width (sort_values returns a new frame,
# which is bound back to df).
df = df.sort_values('sepal_width')
df.head()

In [None]:
# Normalizing your data set
# Mean-normalise the first four columns: subtract each column's mean and
# divide by its range (max - min). df is rebound to the result, so any column
# after the fourth is no longer part of df from here on.
# .iloc selects the columns by position; the .ix indexer used previously was
# removed in pandas 1.0.
df = df.iloc[:, 0:4].apply(lambda f: (f - f.mean()) / (f.max() - f.min()))
df.hist()
plt.show()

In [None]:
# Shuffle the data set: sampling with frac=1.0 draws every row, in random order.
# No random_state is passed, so the order differs from run to run.
df=df.sample(frac=1.0)
df.head()

In [None]:
# Split the data set into test and train set
# 80% of the rows are drawn at random for training (seeded for repeatability);
# the test set is whatever remains once those index labels are dropped.
train = df.sample(frac=0.8, random_state=123)
test = df.drop(index=train.index)

# Read/Write

In [None]:
# Write the frame to CSV and read it straight back.
# to_csv stores the index as the first, unnamed column; index_col=0 restores it
# as the index on load instead of surfacing it as an extra "Unnamed: 0" column.
df.to_csv('iris_normalized.csv')
new_df = pd.read_csv('iris_normalized.csv', index_col=0)
new_df.head()

# Deal with missing data 

In [None]:
import numpy as np
# A small 4x4 frame with missing values: column C is entirely NaN and
# column D has no gaps.
df = pd.DataFrame(
    data=[
        [np.nan, 2, np.nan, 0],
        [3, 4, np.nan, 1],
        [np.nan, np.nan, np.nan, 5],
        [np.nan, 3, np.nan, 4],
    ],
    columns=['A', 'B', 'C', 'D'],
)
df

In [None]:
# Show a copy of the frame with every missing entry replaced by 0
# (df itself is left untouched).
df.fillna(value=0)

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html#pandas.DataFrame.fillna

In [None]:
 df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
                    "toy": [np.nan, 'Batmobile', 'Bullwhip'],
                    "born": [pd.NaT, pd.Timestamp("1940-04-25"),
                             pd.NaT]})
df

In computer programming, a sentinel value (also referred to as a flag value, trip value, rogue value, signal value, or dummy data) is a special value in the context of an algorithm which uses its presence as a condition of termination, typically in a loop or recursive algorithm.

pandas uses the floating-point NaN (and NaT for datetime values) as its sentinel for missing data.

In [None]:
# Show the frame without any row that contains at least one missing value
# (dropna returns a new frame; df is left unchanged).
df.dropna()

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html#pandas.DataFrame.dropna

# Features extraction

# Text Features

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: one row per document and one column per vocabulary
# term, each cell holding how often the term occurs in that document.
sample = ['sample of evil', 'evil queen', 'horizon problem']
vec = CountVectorizer()
X = vec.fit_transform(sample)
# X is a sparse matrix; toarray() densifies it so it can be labelled.
# get_feature_names_out() supplies the column labels; the older
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
feature_extraction = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
print(feature_extraction)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for the `sample` documents defined in the CountVectorizer
# cell: same layout as the count matrix, but each cell is a tf-idf weight.
vec = TfidfVectorizer()
X = vec.fit_transform(sample)
# get_feature_names_out() supplies the column labels; the older
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
feature_extraction = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
print(feature_extraction)

# Derived Features

In [None]:
%matplotlib inline 
# For jupyter notebook only
import numpy as np
import matplotlib.pyplot as plt

# Five sample points: x runs 1..5, y holds the observed values.
x = np.arange(1, 6)
y = np.array([4, 2, 1, 3, 7])

# Draw the points as markers, then join them with straight segments.
plt.scatter(x, y)
plt.plot(x, y)
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression
x = np.arange(1, 6)
y = np.array([4, 2, 1, 3, 7])

# scikit-learn expects a 2-D feature matrix, so turn x into a single column.
X = x.reshape(-1, 1)

# Fit a straight line and evaluate it at the training points.
model = LinearRegression()
model.fit(X, y)
yfit = model.predict(X)

# Data as markers, fitted line on top.
plt.scatter(x, y)
plt.plot(x, yfit)
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
x = np.arange(1, 6)
y = np.array([4, 2, 1, 3, 7])

# Single-column feature matrix built from x.
X = x.reshape(-1, 1)
print("X\n", X)

# Derive polynomial features up to degree 3 from x; include_bias=False leaves
# out the constant column of ones.
poly = PolynomialFeatures(degree=3, include_bias=False)
# fit vs. fit_transform:
# https://datascience.stackexchange.com/questions/12321/difference-between-fit-and-fit-transform-in-scikit-learn-models
X2 = poly.fit_transform(X)
print("\nX2\n", X2)


# An ordinary linear regression on the derived features.
model = LinearRegression()
model.fit(X2, y)
yfit = model.predict(X2)

# Data as markers, fitted values joined by a line.
plt.scatter(x, y)
plt.plot(x, yfit)
plt.show()


#     degree : integer
#     The degree of the polynomial features

#     include_bias : boolean
#     If True (default), then include a bias column,
#     the feature in which all polynomial powers are zero
#     (i.e. a column of ones - acts as an intercept term in a linear model).


#     rows = np.array([0, 3], dtype=np.intp)
#     columns = np.array([0, 2], dtype=np.intp)
#     rows[:, np.newaxis]
#     array([[0],
#        [3]])

#     # intp: integer type used for indexing (same as C ssize_t; normally either int32 or int64)



# Scipy

• A collection of mathematical algorithms

• Gives Python capabilities similar to Matlab's

• Many submodules are used for different domains

• We will see examples from linalg and optimize submodules

• For details: http://docs.scipy.org/doc/scipy/reference/tutorial/index.html

linalg: Linear Algebra submodule

The linear algebra submodule provides several routines for matrix computations.
For example, to find the inverse of a matrix A:

In [None]:
from scipy import linalg as la
import numpy as np
# A 3x3 matrix and its inverse computed with scipy.linalg.
A = np.array([
    [5, 3, 5],
    [2, 2, 0],
    [1, 3, 1],
])
iA = la.inv(A)
print(iA)

# Solving linear systems of equations
Solve $Ax = b$ for $x$.

In [None]:
# Coefficient matrix and right-hand side of the system A x = b.
A = np.array([
    [5, 3, 5],
    [2, 2, 0],
    [1, 3, 1],
])
b = np.array([2, 5, 1])

# Solve for x directly.
# Alternative with the same result: x = la.inv(A).dot(b)
x = la.solve(A, b)
print('Solution:', x)

# More Will Come...

# http://book.pythontips.com/en/latest/map_filter.html
    
map filter reduce