# cuDF 
**cuDF is a Python GPU DataFrame library (built on the Apache Arrow columnar memory format) for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.**

# Load libraries

In [None]:
import os

import pandas as pd
import numpy as np

import cupy as cp
import cudf as cd

# Import data from csv

### **movies_pdf** is our Pandas DF

In [None]:
# Read the movies CSV from S3 into a pandas DataFrame.
# Anonymous access is requested explicitly (as the cuDF read of this same
# file does) so the load does not depend on local AWS credentials or on
# pandas retrying the request anonymously after a credential failure.
movies_pdf = pd.read_csv(
    "s3://bsql/data/rapids_intro/movies.csv",
    storage_options={'anon': True},
)

### **movies_cdf** is our cuDF

In [None]:
# Read the movies CSV from S3 into a cuDF DataFrame, accessing the bucket
# anonymously.
movies_cdf = cd.read_csv(
    "s3://bsql/data/rapids_intro/movies.csv",
    storage_options={'anon': True},
)

# Gather dataset statistics

In [None]:
# Report the pandas DataFrame's shape, number of dimensions, and row count,
# one value per line.
print(movies_pdf.shape, movies_pdf.ndim, len(movies_pdf), sep="\n")

In [None]:
# Report the cuDF DataFrame's shape, number of dimensions, and row count,
# one value per line.
print(movies_cdf.shape, movies_cdf.ndim, len(movies_cdf), sep="\n")

# Explore Data

In [None]:
# Preview the first few rows of the pandas DataFrame.
movies_pdf.head()

In [None]:
# Preview the first few rows of the cuDF DataFrame.
movies_cdf.head()

In [None]:
# Print the pandas DataFrame summary produced by info().
movies_pdf.info()

In [None]:
# Print the cuDF DataFrame summary produced by info().
movies_cdf.info()

In [None]:
# List the column labels of the pandas DataFrame.
movies_pdf.columns

In [None]:
# List the column labels of the cuDF DataFrame.
movies_cdf.columns

# Select subsets of the dataframe

### Select only the numeric fields (treated here as the continuous data)

In [None]:
# Keep only the numeric ("number") columns of the pandas DataFrame and
# preview them.
movies_pdf.select_dtypes(include="number").head()

In [None]:
# Keep only the numeric ("number") columns of the cuDF DataFrame and
# preview them.
movies_cdf.select_dtypes(include="number").head()

### Select fields that are floats only

In [None]:
# Keep only the floating-point columns of the pandas DataFrame and preview
# them.
movies_pdf.select_dtypes(include="float").head()

In [None]:
# Keep only the floating-point columns of the cuDF DataFrame and preview
# them.
movies_cdf.select_dtypes(include="float").head()

### Select only the object-dtype fields (treated here as the discrete values)

In [None]:
# Keep only the object-dtype columns of the pandas DataFrame and preview
# them.
movies_pdf.select_dtypes(include="object").head()

In [None]:
# Keep only the object-dtype columns of the cuDF DataFrame and preview
# them.
movies_cdf.select_dtypes(include="object").head()

# Data Analysis

### Summary statistics for all continuous data fields

In [None]:
# Run describe() over just the numeric columns of the pandas DataFrame.
movies_pdf.select_dtypes(include="number").describe()

In [None]:
# Run describe() over just the numeric columns of the cuDF DataFrame.
movies_cdf.select_dtypes(include="number").describe()

### Summary statistics for all discrete value fields

In [None]:
# Run describe() over just the object-dtype columns of the pandas DataFrame.
movies_pdf.select_dtypes(include="object").describe()

In [None]:
# Run describe() over just the object-dtype columns of the cuDF DataFrame.
movies_cdf.select_dtypes(include="object").describe()

### Transpose the cuDF describe results

(this can be done in Pandas too)

In [None]:
# .T transposes the describe() result, so each described column becomes a
# row and each statistic becomes a column.
movies_cdf.select_dtypes(include="number").describe().T

### Covariance calculation of two continuous variables

In [None]:
# Covariance between movie_facebook_likes and actor_3_facebook_likes (pandas).
movies_pdf['movie_facebook_likes'].cov(movies_pdf['actor_3_facebook_likes'])

In [None]:
# Covariance between movie_facebook_likes and actor_3_facebook_likes (cuDF).
movies_cdf['movie_facebook_likes'].cov(movies_cdf['actor_3_facebook_likes'])

### Pearson correlation of two continuous variables

In [None]:
# Correlation between movie_facebook_likes and actor_3_facebook_likes (pandas).
movies_pdf['movie_facebook_likes'].corr(movies_pdf['actor_3_facebook_likes'])

In [None]:
# Correlation between movie_facebook_likes and actor_3_facebook_likes (cuDF).
movies_cdf['movie_facebook_likes'].corr(movies_cdf['actor_3_facebook_likes'])

### Groupby
Sum the gross amounts for each pair of the two main actors (`actor_1_name`, `actor_2_name`)

In [None]:
# Sum `gross` for every (actor_1_name, actor_2_name) pair in the pandas
# DataFrame; only the two key columns and `gross` are carried into the groupby.
(
    movies_pdf[['actor_1_name', 'actor_2_name', 'gross']]
    .groupby(['actor_1_name', 'actor_2_name'])
    .sum()
)

In [None]:
# Sum `gross` for every (actor_1_name, actor_2_name) pair in the cuDF
# DataFrame; only the two key columns and `gross` are carried into the groupby.
(
    movies_cdf[['actor_1_name', 'actor_2_name', 'gross']]
    .groupby(['actor_1_name', 'actor_2_name'])
    .sum()
)

# Data Preparation

The `genres` field holds combinations of several genres packed into a single pipe-delimited string. For example: `Action|Adventure|Comedy|Fantasy|Sci-Fi`

In [None]:
# Compute the distinct values of the genres column once and reuse them for
# both the count and the sample, instead of calling unique() twice.
unique_genres = movies_pdf.genres.unique()
print(f'There are {len(unique_genres)} genre combinations in the genres field')
# Show the first ten distinct combinations as examples.
print('Examples:\n', unique_genres[:10])

## Splitting the Genre column using Pandas

In [None]:
# Split the pipe-delimited genres string into one column per position,
# prefix those columns with 'genre_', and join them onto the original frame.
genres_pdf = movies_pdf.join(
    movies_pdf['genres'].str.split('|', expand=True).add_prefix('genre_')
)

## Splitting the Genre column using cuDF
cuDF does not have the add_prefix() option when splitting a column.  Here is one way to rename your columns in cuDF.

In [None]:
# Split the pipe-delimited genres string once and reuse the result for both
# the column count and the join, rather than running the split twice.
split_genres_cdf = movies_cdf.genres.str.split('|', expand=True)

# Number of columns the split produced.
genre_fields = len(split_genres_cdf.columns)
print(f'There will be {genre_fields} new columns that will be added into our dataframe\n')

# Attach the split columns to the original cuDF DataFrame and preview it.
genres_cdf = movies_cdf.join(split_genres_cdf)
genres_cdf.head()

**The new columns are named with numbers, not strings**

In [None]:
# Take the last `genre_fields` column labels of genres_cdf (the columns added
# by the split) and turn them into a plain Python list.
added_labels = genres_cdf.columns[-genre_fields:]
col_numbers = added_labels.to_list()
print(col_numbers)

**Convert the numbers to strings with a `genre_` prefix, then combine the old and new names into a dictionary that is used to rename the new columns in the cuDF**

In [None]:
# Build the replacement label for each numeric column label: 'genre_<number>'.
new_col_names = [f'genre_{number}' for number in col_numbers]
print(new_col_names)

# Pair each old label with its new name to form the rename mapping.
old_to_new = zip(col_numbers, new_col_names)
new_col_dict = dict(old_to_new)
print(new_col_dict)

In [None]:
# Rename the split columns using the old-label -> new-name mapping and rebind
# genres_cdf to the renamed result.
genres_cdf = genres_cdf.rename(columns=new_col_dict)

In [None]:
# Preview the cuDF DataFrame after the rename.
genres_cdf.head()

# One Hot Encoding

## OHE using Pandas
Applied on the genre_0 column

In [None]:
# One-hot encode the genre_0 column into indicator columns prefixed with
# 'genre_0'.
pd_ohe = pd.get_dummies(genres_pdf['genre_0'], prefix='genre_0')

# Place the indicator columns alongside the existing pandas columns.
df = pd.concat([genres_pdf, pd_ohe], axis='columns')
df.head()

## OHE using cuDF
Applied on the genre_0 column

In [None]:
# One-hot encode only the genre_0 column of the cuDF DataFrame, using
# 'genre_0' as the prefix for the generated indicator columns.
cdf = cd.get_dummies(
    genres_cdf,
    columns=['genre_0'],
    prefix='genre_0',
)
cdf.head()