In [None]:
import numpy as np

In [None]:
cd ~/classes/DSC478/data

#### In this example, we consider the use of NumPy in handling a typical relational dataset (__[video_store.csv](../data/video_store.csv)__). Since some fields are not numeric, we cannot use the "loadtxt" function with its default float dtype to load the table. Instead we can use the more general function "genfromtxt" to load the data with dtype=str. This will create a 2d array with strings as entries. The numeric fields can later be converted into the appropriate data type.

In [None]:
# Read every field as text; the numeric columns are converted one by one later.
vstable = np.genfromtxt(
    "video_store.csv",
    dtype=str,
    delimiter=",",
)
print(vstable)

In [None]:
# Keep the first row of the raw table (the column names) for reference.
labels = vstable[0, :]
print(labels)

In [None]:
# Drop the header row so that only data records remain. Comparing the first row
# with `labels` keeps this cell idempotent: re-running it does not strip an
# additional (data) row.
if (vstable[0] == labels).all():
    vstable = vstable[1:]
vstable[0:5]

#### Now we can convert columns to the appropriate type as necessary:

In [None]:
# Column 3 of the text table, parsed as integers.
age = vstable[:, 3].astype(int)
print(age)

In [None]:
# Column 2 of the text table, parsed as floats.
sal = vstable[:, 2].astype(float)
print(sal)

In [None]:
# Extremes of the salary column; reused below for min-max normalization.
min_sal, max_sal = np.min(sal), np.max(sal)
print(min_sal, max_sal)

In [None]:
# Columns 4 and 5 of the text table, both parsed as floats.
rentals = vstable[:, 4].astype(float)
visit_avg = vstable[:, 5].astype(float)

#### Let's normalize Income using standard Python list comprehensions

In [None]:
# Min-max scaling done element by element in pure Python (result is a list).
norm_sal = [
    (income - min_sal) / (max_sal - min_sal)
    for income in sal
]
print(norm_sal)

#### But, we can do this more easily and efficiently with NumPy

In [None]:
# Vectorized min-max scaling: the arithmetic is applied to the whole array at once.
sal_range = max_sal - min_sal
norm_sal = (sal - min_sal) / sal_range

# Compact float display: 4 decimals, no scientific notation, 80-character rows.
np.set_printoptions(suppress=True, precision=4, linewidth=80)
print(norm_sal)


#### Z-Score Standardization on Age

In [None]:
# Mean and standard deviation of age, needed for the z-score below.
age_mean, age_std = np.mean(age), np.std(age)
print(age_mean, age_std)

In [None]:
# Z-score: distance of each age from the mean, in units of standard deviation.
age_znorm = np.divide(age - age_mean, age_std)
print(age_znorm)

#### Suppose that we would like to find all "good customers", defined as those with a Rentals value of >= 30:

In [None]:
# Boolean mask over the rows, then use it to select the matching records.
is_good = rentals >= 30
good_cust = vstable[is_good]
print(good_cust)

#### Now, suppose we want to change the Gender attribute into binary, converting it from one categorical attribute with two values into two attributes (say 'Gender_F' and 'Gender_M') with 0 or 1 as values depending on the original value. These new variables are sometimes called "dummy" variables. The purpose of this transformation is to allow for the application of techniques that require all attributes to be numerical (e.g., regression analysis or correlation analysis). Below, we show how this could be done manually for illustration purposes. In practice (as we shall see later in this notebook), there are Python libraries and packages that perform this type of transformation automatically.

In [None]:
# Independent copy of column 1 (the Gender codes) from the text table.
gender = vstable[:, 1].copy()
gender

In [None]:
# Start the female indicator as all zeros, one entry per customer.
gen_f = np.zeros(gender.shape[0])
gen_f

In [None]:
# Flip the indicator to 1 wherever the gender code is 'F'.
gen_f[gender == 'F'] = 1.0
gen_f

In [None]:
# Male indicator built in one step: the boolean mask cast to 0.0 / 1.0.
gen_m = (gender == 'M').astype(float)
gen_m

#### Let's now create a new 2d array with the old Gender attribute replaced with the new ones. In the example below, we have removed the two other categorical attributes (Incidentals and Genre) for now, just to illustrate what the data would look like in "Standard Spreadsheet Format":

In [None]:
# Stack the six numeric attributes as rows, then transpose so that each row
# of the result is one customer and each column is one attribute.
feature_rows = [gen_f, gen_m, sal, age, rentals, visit_avg]
vs_new = np.array(feature_rows).transpose()

np.set_printoptions(linewidth=80)

# Here are the first 5 elements of the new array
print(vs_new[:5])

#### Note that we can apply standard statistical or numeric functions to the whole array and not just to individual columns:

In [None]:
# axis=0 averages down the rows, giving one mean per attribute column.
column_means = vs_new.mean(axis=0)
f_mean, m_mean, sal_mean, age_mean, rentals_mean, visavg_mean = column_means

print("       Gen=F Gen=M Income  Age  Rntls  VisAvg")
print("Mean: ", *column_means)

In [None]:
# Now that the data is in all-numeric form, we can apply techniques such as
# correlation analysis on the variables. rowvar=False treats each column of
# vs_new as one variable.
np.corrcoef(vs_new, rowvar=False)

#### The new table can be written into a file using "savetxt" function:

In [None]:
# Write the numeric table as CSV. The context manager guarantees the file is
# flushed and closed even if savetxt raises. The multi-format `fmt` string
# already contains the commas, so no separate `delimiter` argument is needed.
with open("new_video_store.csv", "w") as out_file:
    np.savetxt(out_file, vs_new, fmt='%d,%d,%1.2f,%1.2f,%1.2f,%1.2f')

#### An alternative method for loading heterogeneous (mixed type) data into an array is to specify the dtype of each column and set "names" to "True". This creates a structured array with each row representing a tuple. Each column can be accessed by the keys extracted from the first line of the data file.

In [None]:
# Structured load: one dtype per column, field names taken from the header line.
vs = np.genfromtxt(
    "video_store.csv",
    delimiter=",",
    names=True,
    dtype=(int, "|U1", float, int, int, float, "|U10"),
)
print(vs)

In [None]:
# Field names and types of a single record of the structured array.
vs[0].dtype

In [None]:
# A structured array's column is selected by its field name.
print(vs["Gender"])

In [None]:
# The Income field, selected by name.
print(vs["Income"])

In [None]:
# Number of records whose Genre is 'Action' (True counts as 1 in the sum).
print((vs['Genre'] == 'Action').sum())

In [None]:
# Same "good customer" filter as before, this time on the structured array.
is_good = vs['Rentals'] >= 30
good_cust = vs[is_good]
good_cust

In [None]:
# Summary statistics of the Rentals field among the good customers.
good_rentals = good_cust['Rentals']
print("Min Rentals: ", good_rentals.min())
print("Max Rentals: ", good_rentals.max())
print("Rentals Mean: ", good_rentals.mean())
print("Rentals Median: ", np.median(good_rentals))
print("Rentals Std. Dev.: ", good_rentals.std())

#### For most types of analysis, we would want to exclude the index column from the data (in this case the ID attribute). This could be done by removing the first column of the matrix. However, it could have been done when reading the data in using the "usecols" parameter in "genfromtxt".

In [None]:
# Reload without the ID column (column 0). Genre is read as a unicode string
# ("|U10"), matching the earlier structured load; a bytes dtype ("|S10") would
# make later comparisons against str literals such as 'Action' always False
# under Python 3.
vs_nid = np.genfromtxt(
    "video_store.csv",
    delimiter=",",
    usecols=(1, 2, 3, 4, 5, 6),
    names=True,
    dtype=("|U1", float, int, int, float, "|U10"),
)
vs_nid[0:5]

#### Once the data is in structured array format as above, we can combine the tuples with feature names to create an array of dicts. The DictVectorizer class from the Scikit-learn library can then be used to create dummy variables for each of the categorical attributes and convert the data into the standard spreadsheet format. This approach is preferred over the manual approach for creating dummy variables discussed earlier in this notebook.

In [None]:
# Field names of the structured array, in column order.
names = vs_nid.dtype.names

# One {field name: value} dict per record.
vs_dict = [
    {name: value for name, value in zip(names, row)}
    for row in vs_nid
]

In [None]:
# Inspect the first record in dict form.
print(vs_dict[0])

In [None]:
from sklearn.feature_extraction import DictVectorizer

In [None]:
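# NOTE(review): the DictVectorizer call below was disabled when moving from Python 2 to Python 3; the next cell builds the dummy variables by hand instead.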
#vs_vec = DictVectorizer()

In [None]:
np.set_printoptions(linewidth=100)

# Categorical attributes and the values each one can take. Every record gets
# one 0/1 "dummy" entry per (attribute, value) pair, keyed "<attribute>=<value>".
CATEGORY_LEVELS = {
    "Gender": ("F", "M"),
    "Genre": ("Action", "Comedy", "Drama"),
}


def _one_hot_record(record, category_levels=CATEGORY_LEVELS):
    """Return a copy of `record` with its categorical attributes one-hot encoded.

    Parameters
    ----------
    record : dict
        Maps attribute name to value for a single customer.
    category_levels : dict
        Maps each categorical attribute name to the tuple of values it can take.

    Returns
    -------
    dict
        Non-categorical attributes are copied unchanged. Each categorical
        attribute is replaced by one "<attribute>=<value>" key per known value,
        set to 1 for the record's own value and 0 otherwise. A value that is
        not among the known ones yields all zeros for that attribute.
    """
    encoded = {}
    for key, value in record.items():
        levels = category_levels.get(key)
        if levels is None:
            # Numeric attribute: keep as is.
            encoded[key] = value
            continue
        if isinstance(value, bytes):
            # Tolerate fields that were loaded with a bytes ("S") dtype.
            value = value.decode()
        for level in levels:
            encoded[key + "=" + level] = int(value == level)
    return encoded


vs_vect = [_one_hot_record(item) for item in vs_dict]

#print(len(vs_dict))
#print(len(vs_vect))


In [None]:
# Collect the numeric columns used by the plots below. Direct indexing makes a
# record that lacks the attribute fail immediately, instead of silently
# putting None into the array.
income = np.array([d['Income'] for d in vs_vect])
age = np.array([d['Age'] for d in vs_vect])

# Feature names of the first encoded record. Placed last so that the notebook
# actually displays it (only a cell's final expression is shown).
vs_vect[0].keys()



#### Now let's do some visualization using the Matplotlib library to get some more insight into the characteristics of the variables and their relationships.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Distribution of customer income in 9 bins, on fixed axis limits.
plt.hist(income, bins=9, alpha=0.5)
plt.title('Histogram of Income')
plt.xlabel('Income')
plt.ylabel('Count')
plt.xlim(0, 100000)
plt.ylim(0, 10)
plt.grid(True)
plt.show()

#### Let's cross-tabulate the Genre and the Gender attributes to find out if men and women have different movie preferences. [Note: correlation analysis performed earlier could also shed some light on this question.]

In [None]:
# First we need the counts for males and females across different genres
# Hard-coded per-genre customer counts, in the order Action, Comedy, Drama.
f_counts = [8, 6, 12]   # female customers
m_counts = [14, 6, 8]   # male customers

# Number of genre groups on the x axis.
N = len(m_counts)

In [None]:
width = 0.35                # the width of the bars
ind = np.arange(N) + 0.15   # x locations of the female bars (one per genre)

fig, ax = plt.subplots()
# Draw each series exactly once, on the explicit Axes; the male bars sit
# immediately to the right of the female bars.
rects1 = ax.bar(ind, f_counts, width, color='b')
rects2 = ax.bar(ind + width, m_counts, width, color='r')

ax.set_ylabel('Counts')
ax.set_ybound(upper=16)
ax.set_title('Counts by Genre and Gender')
# Bars are centred on their x positions by default, so the midpoint of each
# female/male pair is ind + width / 2.
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(('Action', 'Comedy', 'Drama'))

ax.legend((rects1[0], rects2[0]), ('Female', 'Male'))

plt.show()

# plt.savefig("figure.pdf")

#### This figure shows that male customers tend to prefer action movies, while female customers tend to like dramas. 

#### Now, let's use a scatter plot to discover possible correlations between Age and Income.

In [None]:
# One figure holding a single Axes (one row, one column).
fig, ax = plt.subplots(figsize=(5, 4))
# Each customer is one star: age on the x axis, income on the y axis.
ax.scatter(age, income, color="blue", marker="*")
# Title and axis labels in a single call.
ax.set(title="Age VS. Income", xlabel="Age", ylabel="Income")
# To write the image to disk instead:
# fig.savefig("scatterplot.png")
plt.show()

#### Indeed, there does appear to be a positive correlation between Age and Income. This can also be observed by viewing the Pearson r correlation coefficient in our correlation matrix shown earlier.