In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score, classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, precision_recall_curve, average_precision_score
from sklearn.metrics import homogeneity_score, silhouette_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import MiniBatchKMeans, DBSCAN

In [2]:
volunteer = pd.read_csv('https://assets.datacamp.com/production/repositories/1816/datasets/668b96955d8b252aa8439c7602d516634e3f015e/volunteer_opportunities.csv')

In [3]:
volunteer.head()

Unnamed: 0,opportunity_id,content_id,vol_requests,event_time,title,hits,summary,is_priority,category_id,category_desc,...,end_date_date,status,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
0,4996,37004,50,0,Volunteers Needed For Rise Up & Stay Put! Home...,737,Building on successful events last summer and ...,,,,...,July 30 2011,approved,,,,,,,,
1,5008,37036,2,0,Web designer,22,Build a website for an Afghan business,,1.0,Strengthening Communities,...,February 01 2011,approved,,,,,,,,
2,5016,37143,20,0,Urban Adventures - Ice Skating at Lasker Rink,62,Please join us and the students from Mott Hall...,,1.0,Strengthening Communities,...,January 29 2011,approved,,,,,,,,
3,5022,37237,500,0,Fight global hunger and support women farmers ...,14,The Oxfam Action Corps is a group of dedicated...,,1.0,Strengthening Communities,...,March 31 2012,approved,,,,,,,,
4,5055,37425,15,0,Stop 'N' Swap,31,Stop 'N' Swap reduces NYC's waste by finding n...,,4.0,Environment,...,February 05 2011,approved,,,,,,,,


In [4]:
volunteer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 665 entries, 0 to 664
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   opportunity_id      665 non-null    int64  
 1   content_id          665 non-null    int64  
 2   vol_requests        665 non-null    int64  
 3   event_time          665 non-null    int64  
 4   title               665 non-null    object 
 5   hits                665 non-null    int64  
 6   summary             665 non-null    object 
 7   is_priority         62 non-null     object 
 8   category_id         617 non-null    float64
 9   category_desc       617 non-null    object 
 10  amsl                0 non-null      float64
 11  amsl_unit           0 non-null      float64
 12  org_title           665 non-null    object 
 13  org_content_id      665 non-null    int64  
 14  addresses_count     665 non-null    int64  
 15  locality            595 non-null    object 
 16  region  

In [5]:
# Check how many values are missing in the category_desc column
print(volunteer['category_desc'].isnull().sum())

# Subset the volunteer dataset
volunteer_subset = volunteer[volunteer['category_desc'].notnull()]

# Print out the shape of the subset
print(volunteer_subset.shape)

48
(617, 35)


In [6]:
volunteer.dtypes

opportunity_id          int64
content_id              int64
vol_requests            int64
event_time              int64
title                  object
hits                    int64
summary                object
is_priority            object
category_id           float64
category_desc          object
amsl                  float64
amsl_unit             float64
org_title              object
org_content_id          int64
addresses_count         int64
locality               object
region                 object
postalcode            float64
primary_loc           float64
display_url            object
recurrence_type        object
hours                   int64
created_date           object
last_modified_date     object
start_date_date        object
end_date_date          object
status                 object
Latitude              float64
Longitude             float64
Community Board       float64
Community Council     float64
Census Tract          float64
BIN                   float64
BBL       

### Converting a column type

In [7]:
# Print the head of the hits column
print(volunteer["hits"].head())

# Convert the hits column to type int
volunteer["hits"] = volunteer['hits'].astype('int64')

# Look at the dtypes of the dataset
print(volunteer.dtypes)

0    737
1     22
2     62
3     14
4     31
Name: hits, dtype: int64
opportunity_id          int64
content_id              int64
vol_requests            int64
event_time              int64
title                  object
hits                    int64
summary                object
is_priority            object
category_id           float64
category_desc          object
amsl                  float64
amsl_unit             float64
org_title              object
org_content_id          int64
addresses_count         int64
locality               object
region                 object
postalcode            float64
primary_loc           float64
display_url            object
recurrence_type        object
hours                   int64
created_date           object
last_modified_date     object
start_date_date        object
end_date_date          object
status                 object
Latitude              float64
Longitude             float64
Community Board       float64
Community Council     float64


### Stratified sampling

In [8]:
volunteer['Latitude'].unique()

array([nan])

In [9]:
df_volunteer = volunteer.dropna(axis=1).select_dtypes('number')           

In [10]:
df_volunteer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 665 entries, 0 to 664
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   opportunity_id   665 non-null    int64
 1   content_id       665 non-null    int64
 2   vol_requests     665 non-null    int64
 3   event_time       665 non-null    int64
 4   hits             665 non-null    int64
 5   org_content_id   665 non-null    int64
 6   addresses_count  665 non-null    int64
 7   hours            665 non-null    int64
dtypes: int64(8)
memory usage: 41.7 KB


In [11]:
y_volunteer = volunteer[['category_desc']].fillna(axis=0, method = 'backfill')

In [12]:
# Create a data with all columns except category_desc
volunteer_X = df_volunteer

# Create a category_desc labels dataset
volunteer_y = y_volunteer[['category_desc']]

# Use stratified sampling to split up the dataset according to the volunteer_y dataset
X_train, X_test, y_train, y_test = train_test_split(volunteer_X, volunteer_y, stratify=volunteer_y)

# Print out the category_desc counts on the training y labels
print(y_train['category_desc'].value_counts())

Strengthening Communities    250
Helping Neighbors in Need     92
Education                     73
Health                        44
Environment                   26
Emergency Preparedness        13
Name: category_desc, dtype: int64


In [13]:
print(y_test['category_desc'].value_counts())

Strengthening Communities    84
Helping Neighbors in Need    31
Education                    25
Health                       14
Environment                   9
Emergency Preparedness        4
Name: category_desc, dtype: int64


### Modeling without normalizing

In [14]:
wine = pd.read_csv('https://assets.datacamp.com/production/repositories/1816/datasets/9bd5350dfdb481e0f94eeef6acf2663452a8ef8b/wine_types.csv')

In [15]:
wine.head()

Unnamed: 0,Type,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [16]:
X = wine.drop('Type', axis=1)
y = wine['Type']

In [17]:
# Split the dataset and labels into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y)
knn = knn()
# Fit the k-nearest neighbors model to the training data
knn.fit(X_train,y_train)

# Score the model on the test data
print(knn.score(X_test,y_test))

0.7111111111111111


### Log normalization in Python

In [18]:
# Print out the variance of the Proline column
print(wine['Proline'].var())

# Apply the log normalization function to the Proline column
wine['Proline_log'] = np.log(wine['Proline'])

# Check the variance of the normalized Proline column
print(wine['Proline_log'].var())

99166.71735542428
0.17231366191842018


### Scaling data - standardizing columns

In [19]:
# Import StandardScaler from scikit-learn
from sklearn.preprocessing import StandardScaler

# Create the scaler
ss = StandardScaler()

# Take a subset of the DataFrame you want to scale 
wine_subset = wine[['Ash', 'Alcalinity of ash', 'Magnesium']]

# Apply the scaler to the DataFrame subset
wine_subset_scaled = ss.fit_transform(wine_subset)

In [20]:
wine_subset_scaled.var()

1.0

In [21]:
# Split the dataset and labels into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y)

# Fit the k-nearest neighbors model to the training data
knn.fit(X_train, y_train)

# Score the model on the test data
print(knn.score(X_test, y_test))

0.6666666666666666


### KNN on scaled data

In [22]:
# Create the scaling method.
ss = StandardScaler()

# Apply the scaling method to the dataset used for modeling.
X_scaled = ss.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

# Fit the k-nearest neighbors model to the training data.
knn.fit(X_train, y_train)

# Score the model on the test data.
print(knn.score(X_test,y_test))

0.9555555555555556


### Encoding categorical variables - binary

In [23]:
hiking = pd.read_json('https://assets.datacamp.com/production/repositories/1816/datasets/4f26c48451bdbf73db8a58e226cd3d6b45cf7bb5/hiking.json')

In [24]:
hiking.head()

Unnamed: 0,Prop_ID,Name,Location,Park_Name,Length,Difficulty,Other_Details,Accessible,Limited_Access,lat,lon
0,B057,Salt Marsh Nature Trail,"Enter behind the Salt Marsh Nature Center, loc...",Marine Park,0.8 miles,,<p>The first half of this mile-long trail foll...,Y,N,,
1,B073,Lullwater,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,1.0 mile,Easy,Explore the Lullwater to see how nature thrive...,N,N,,
2,B073,Midwood,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,0.75 miles,Easy,Step back in time with a walk through Brooklyn...,N,N,,
3,B073,Peninsula,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,0.5 miles,Easy,Discover how the Peninsula has changed over th...,N,N,,
4,B073,Waterfall,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,0.5 miles,Easy,Trace the source of the Lake on the Waterfall ...,N,N,,


In [25]:
hiking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Prop_ID         33 non-null     object 
 1   Name            33 non-null     object 
 2   Location        33 non-null     object 
 3   Park_Name       33 non-null     object 
 4   Length          29 non-null     object 
 5   Difficulty      27 non-null     object 
 6   Other_Details   31 non-null     object 
 7   Accessible      33 non-null     object 
 8   Limited_Access  33 non-null     object 
 9   lat             0 non-null      float64
 10  lon             0 non-null      float64
dtypes: float64(2), object(9)
memory usage: 3.0+ KB


In [26]:
from sklearn.preprocessing import LabelEncoder

In [27]:
# Set up the LabelEncoder object
enc = LabelEncoder()

# Apply the encoding to the "Accessible" column
hiking['Accessible_enc'] = enc.fit_transform(hiking['Accessible'])

# Compare the two columns
print(hiking[['Accessible_enc', 'Accessible']].head())
print(hiking.head())
hiking.info()

   Accessible_enc Accessible
0               1          Y
1               0          N
2               0          N
3               0          N
4               0          N
  Prop_ID                     Name  \
0    B057  Salt Marsh Nature Trail   
1    B073                Lullwater   
2    B073                  Midwood   
3    B073                Peninsula   
4    B073                Waterfall   

                                            Location      Park_Name  \
0  Enter behind the Salt Marsh Nature Center, loc...    Marine Park   
1  Enter Park at Lincoln Road and Ocean Avenue en...  Prospect Park   
2  Enter Park at Lincoln Road and Ocean Avenue en...  Prospect Park   
3  Enter Park at Lincoln Road and Ocean Avenue en...  Prospect Park   
4  Enter Park at Lincoln Road and Ocean Avenue en...  Prospect Park   

       Length Difficulty                                      Other_Details  \
0   0.8 miles       None  <p>The first half of this mile-long trail foll...   
1    1.0 mil

### Encoding categorical variables - one-hot

In [28]:
# Transform the category_desc column
category_enc = pd.get_dummies(volunteer["category_desc"])

# Take a look at the encoded columns
print(category_enc.head())

   Education  Emergency Preparedness  Environment  Health  \
0          0                       0            0       0   
1          0                       0            0       0   
2          0                       0            0       0   
3          0                       0            0       0   
4          0                       0            1       0   

   Helping Neighbors in Need  Strengthening Communities  
0                          0                          0  
1                          0                          1  
2                          0                          1  
3                          0                          1  
4                          0                          0  


### Engineering numerical features - - datetime

In [29]:
# First, convert string column to date column
volunteer["start_date_converted"] = pd.to_datetime(volunteer["start_date_date"])

# Extract just the month from the converted column
volunteer["start_date_month"] = volunteer["start_date_converted"].apply(lambda row: row.month)

# Take a look at the converted and new month columns
print(volunteer[["start_date_converted", "start_date_month"]].head())

  start_date_converted  start_date_month
0           2011-07-30                 7
1           2011-02-01                 2
2           2011-01-29                 1
3           2011-02-14                 2
4           2011-02-05                 2


### Engineering features from strings - extraction

In [30]:
import re

In [31]:
hiking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Prop_ID         33 non-null     object 
 1   Name            33 non-null     object 
 2   Location        33 non-null     object 
 3   Park_Name       33 non-null     object 
 4   Length          29 non-null     object 
 5   Difficulty      27 non-null     object 
 6   Other_Details   31 non-null     object 
 7   Accessible      33 non-null     object 
 8   Limited_Access  33 non-null     object 
 9   lat             0 non-null      float64
 10  lon             0 non-null      float64
 11  Accessible_enc  33 non-null     int32  
dtypes: float64(2), int32(1), object(9)
memory usage: 3.1+ KB


In [32]:
hiking.isna().sum()

Prop_ID            0
Name               0
Location           0
Park_Name          0
Length             4
Difficulty         6
Other_Details      2
Accessible         0
Limited_Access     0
lat               33
lon               33
Accessible_enc     0
dtype: int64

In [33]:
hiking.drop(columns=['lat', 'lon'], inplace=True)

In [34]:
hiking.dropna(inplace=True)

In [35]:
# Write a pattern to extract numbers and decimals
def return_mileage(length):
    pattern = re.compile(r"\d+\.\d+")
    
    # Search the text for matches
    mile = re.match(pattern, length)
    
    # If a value is returned, use group(0) to return the found value
    if mile is not None:
        return float(mile.group(0))
        
# Apply the function to the Length column and take a look at both columns
hiking["Length_num"] = hiking['Length'].apply(lambda row: return_mileage(row))
print(hiking[["Length", "Length_num"]].head())

       Length  Length_num
1    1.0 mile        1.00
2  0.75 miles        0.75
3   0.5 miles        0.50
4   0.5 miles        0.50
5     Various         NaN


### Engineering features from strings - tf/idf

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
volunteer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 665 entries, 0 to 664
Data columns (total 37 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   opportunity_id        665 non-null    int64         
 1   content_id            665 non-null    int64         
 2   vol_requests          665 non-null    int64         
 3   event_time            665 non-null    int64         
 4   title                 665 non-null    object        
 5   hits                  665 non-null    int64         
 6   summary               665 non-null    object        
 7   is_priority           62 non-null     object        
 8   category_id           617 non-null    float64       
 9   category_desc         617 non-null    object        
 10  amsl                  0 non-null      float64       
 11  amsl_unit             0 non-null      float64       
 12  org_title             665 non-null    object        
 13  org_content_id      

In [38]:
volunteer_text_clf = volunteer[["category_desc" ,'title']].dropna(axis=0)

In [39]:
# Take the title text
title_text = volunteer_text_clf["title"]

# Create the vectorizer method
tfidf_vec = TfidfVectorizer()

# Transform the text into tf-idf vectors
text_tfidf = tfidf_vec.fit_transform(title_text)

#### Text classification using tf/idf vectors

In [40]:
from sklearn.naive_bayes import GaussianNB

In [41]:
# Split the dataset according to the class distribution of category_desc
y = volunteer_text_clf["category_desc"]
X_train, X_test, y_train, y_test = train_test_split(text_tfidf.toarray(), y, stratify=y)

nb = GaussianNB()
# Fit the model to the training data
nb.fit(X_train, y_train)

# Print out the model's accuracy
print(nb.score(X_test, y_test))

0.5225806451612903


In [42]:
y_pred = nb.predict(X_test)

In [43]:
print(classification_report(y_test, y_pred))

                           precision    recall  f1-score   support

                Education       0.36      0.70      0.48        23
   Emergency Preparedness       0.20      0.50      0.29         4
              Environment       0.30      0.38      0.33         8
                   Health       0.53      0.62      0.57        13
Helping Neighbors in Need       0.62      0.43      0.51        30
Strengthening Communities       0.71      0.51      0.59        77

                 accuracy                           0.52       155
                macro avg       0.45      0.52      0.46       155
             weighted avg       0.59      0.52      0.54       155



### Selecting relevant features

In [44]:
volunteer.head(3)

Unnamed: 0,opportunity_id,content_id,vol_requests,event_time,title,hits,summary,is_priority,category_id,category_desc,...,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA,start_date_converted,start_date_month
0,4996,37004,50,0,Volunteers Needed For Rise Up & Stay Put! Home...,737,Building on successful events last summer and ...,,,,...,,,,,,,,,2011-07-30,7
1,5008,37036,2,0,Web designer,22,Build a website for an Afghan business,,1.0,Strengthening Communities,...,,,,,,,,,2011-02-01,2
2,5016,37143,20,0,Urban Adventures - Ice Skating at Lasker Rink,62,Please join us and the students from Mott Hall...,,1.0,Strengthening Communities,...,,,,,,,,,2011-01-29,1


In [45]:
# Create a list of redundant column names to drop
to_drop = ["category_desc", "created_date", "locality", "region", "vol_requests"]

# Drop those columns from the dataset
volunteer_subset = volunteer.drop(to_drop, axis=1)

# Print out the head of the new dataset
print(volunteer_subset.head())

   opportunity_id  content_id  event_time  \
0            4996       37004           0   
1            5008       37036           0   
2            5016       37143           0   
3            5022       37237           0   
4            5055       37425           0   

                                               title  hits  \
0  Volunteers Needed For Rise Up & Stay Put! Home...   737   
1                                       Web designer    22   
2      Urban Adventures - Ice Skating at Lasker Rink    62   
3  Fight global hunger and support women farmers ...    14   
4                                      Stop 'N' Swap    31   

                                             summary is_priority  category_id  \
0  Building on successful events last summer and ...         NaN          NaN   
1             Build a website for an Afghan business         NaN          1.0   
2  Please join us and the students from Mott Hall...         NaN          1.0   
3  The Oxfam Action Corps is a g

In [46]:
# Print out the column correlations of the wine dataset
print(wine.corr())

# Take a minute to find the column where the correlation value is greater than 0.75 at least twice
to_drop = "Proline"

# Drop that column from the DataFrame
wine = wine.drop(to_drop, axis=1)


                                  Type   Alcohol  Malic acid       Ash  \
Type                          1.000000 -0.328222    0.437776 -0.049643   
Alcohol                      -0.328222  1.000000    0.094397  0.211545   
Malic acid                    0.437776  0.094397    1.000000  0.164045   
Ash                          -0.049643  0.211545    0.164045  1.000000   
Alcalinity of ash             0.517859 -0.310235    0.288500  0.443367   
Magnesium                    -0.209179  0.270798   -0.054575  0.286587   
Total phenols                -0.719163  0.289101   -0.335167  0.128980   
Flavanoids                   -0.847498  0.236815   -0.411007  0.115077   
Nonflavanoid phenols          0.489109 -0.155929    0.292977  0.186230   
Proanthocyanins              -0.499130  0.136698   -0.220746  0.009652   
Color intensity               0.265668  0.546364    0.248985  0.258887   
Hue                          -0.617369 -0.071747   -0.561296 -0.074667   
OD280/OD315 of diluted wines -0.788230

In [47]:
wine.corr()

Unnamed: 0,Type,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline_log
Type,1.0,-0.328222,0.437776,-0.049643,0.517859,-0.209179,-0.719163,-0.847498,0.489109,-0.49913,0.265668,-0.617369,-0.78823,-0.569246
Alcohol,-0.328222,1.0,0.094397,0.211545,-0.310235,0.270798,0.289101,0.236815,-0.155929,0.136698,0.546364,-0.071747,0.072343,0.637325
Malic acid,0.437776,0.094397,1.0,0.164045,0.2885,-0.054575,-0.335167,-0.411007,0.292977,-0.220746,0.248985,-0.561296,-0.36871,-0.152643
Ash,-0.049643,0.211545,0.164045,1.0,0.443367,0.286587,0.12898,0.115077,0.18623,0.009652,0.258887,-0.074667,0.003911,0.238394
Alcalinity of ash,0.517859,-0.310235,0.2885,0.443367,1.0,-0.083333,-0.321113,-0.35137,0.361922,-0.197327,0.018732,-0.273955,-0.276769,-0.416897
Magnesium,-0.209179,0.270798,-0.054575,0.286587,-0.083333,1.0,0.214401,0.195784,-0.256294,0.236441,0.19995,0.055398,0.066004,0.424006
Total phenols,-0.719163,0.289101,-0.335167,0.12898,-0.321113,0.214401,1.0,0.864564,-0.449935,0.612413,-0.055136,0.433681,0.699949,0.431205
Flavanoids,-0.847498,0.236815,-0.411007,0.115077,-0.35137,0.195784,0.864564,1.0,-0.5379,0.652692,-0.172379,0.543479,0.787194,0.410494
Nonflavanoid phenols,0.489109,-0.155929,0.292977,0.18623,0.361922,-0.256294,-0.449935,-0.5379,1.0,-0.365845,0.139057,-0.26264,-0.50327,-0.275675
Proanthocyanins,-0.49913,0.136698,-0.220746,0.009652,-0.197327,0.236441,0.612413,0.652692,-0.365845,1.0,-0.02525,0.295544,0.519067,0.290203


### Exploring text vectors

In [48]:
vocab = {v:k for k,v in   tfidf_vec.vocabulary_.items()}

In [49]:
# Add in the rest of the parameters
def return_weights(vocab, original_vocab, vector, vector_index, top_n):
    zipped = dict(zip(vector[vector_index].indices, vector[vector_index].data))
    
    # Let's transform that zipped dict into a series
    zipped_series = pd.Series({vocab[i]:zipped[i] for i in vector[vector_index].indices})
    
    # Let's sort the series to pull out the top n weighted words
    zipped_index = zipped_series.sort_values(ascending=False)[:top_n].index
    return [original_vocab[i] for i in zipped_index]

# Print out the weighted words
print(return_weights(vocab, tfidf_vec.vocabulary_, text_tfidf, 8, 3))

[189, 942, 466]


In [50]:
def words_to_filter(vocab, original_vocab, vector, top_n):
    filter_list = []
    for i in range(0, vector.shape[0]):
    
        # Here we'll call the function from the previous exercise, and extend the list we're creating
        filtered = return_weights(vocab, original_vocab, vector, i, top_n)
        filter_list.extend(filtered)
    # Return the list in a set, so we don't get duplicate word indices
    return set(filter_list)

# Call the function to get the list of word indices
filtered_words = words_to_filter(vocab, tfidf_vec.vocabulary_, text_tfidf, 3)

# By converting filtered_words back to a list, we can use it to filter the columns in the text vector
filtered_text = text_tfidf[:, list(filtered_words)]

In [51]:
# Split the dataset according to the class distribution of category_desc, using the filtered_text vector
train_X, test_X, train_y, test_y = train_test_split(filtered_text.toarray(), y, stratify=y)

# Fit the model to the training data
nb.fit(train_X,train_y)

# Print out the model's accuracy
print(nb.score(test_X, test_y))

0.5612903225806452


In [52]:
pred_y = nb.predict(test_X)

In [53]:
print(classification_report(test_y, pred_y))

                           precision    recall  f1-score   support

                Education       0.47      0.87      0.61        23
   Emergency Preparedness       0.14      0.25      0.18         4
              Environment       0.40      0.25      0.31         8
                   Health       0.38      0.46      0.41        13
Helping Neighbors in Need       0.60      0.60      0.60        30
Strengthening Communities       0.74      0.52      0.61        77

                 accuracy                           0.56       155
                macro avg       0.45      0.49      0.45       155
             weighted avg       0.61      0.56      0.56       155



### Dimensionality reduction

In [54]:
from sklearn.decomposition import PCA

# Set up PCA and the X vector for diminsionality reduction
pca = PCA()
wine_X = wine.drop("Type", axis=1)

# Apply PCA to the wine dataset X vector
transformed_X = pca.fit_transform(wine_X)

# Look at the percentage of variance explained by the different components
print(pca.explained_variance_ratio_)

[9.08880881e-01 5.13734572e-02 2.50156553e-02 7.15814867e-03
 3.96467881e-03 1.50318506e-03 6.73038210e-04 4.99420623e-04
 3.28340366e-04 3.18673237e-04 1.54029051e-04 9.40494153e-05
 3.64434043e-05]


### Training a model with PCA

In [55]:
y = wine["Type"]

In [56]:
# Split the transformed X and the y labels into training and test sets
X_wine_train, X_wine_test, y_wine_train, y_wine_test = train_test_split(transformed_X,y)

# Fit knn to the training data
knn.fit(X_wine_train, y_wine_train)

# Score knn on the test data and print it out
print(knn.score(X_wine_test, y_wine_test))

0.8888888888888888


### Putting it all together

In [57]:
ufo = pd.read_csv('https://assets.datacamp.com/production/repositories/1816/datasets/a5ebfe5d2ed194f2668867603b563963af4769e9/ufo_sightings_large.csv')

In [58]:
ufo.head()

Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long
0,11/3/2011 19:21,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,-92.291111
1,10/3/2004 19:05,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.4994444,-81.695556
2,9/25/2009 21:00,coon rapids,mn,us,cigar,0.0,,Green&#44 red&#44 and blue pulses of light tha...,12/12/2009,45.12,-93.2875
3,11/21/2002 05:45,clemmons,nc,us,triangle,300.0,about 5 minutes,It was a large&#44 triangular shaped flying ob...,12/23/2002,36.0213889,-80.382222
4,8/19/2010 12:55,calgary (canada),ab,ca,oval,0.0,2,A white spinning disc in the shape of an oval.,8/24/2010,51.083333,-114.083333


In [59]:
# Check the column types
print(ufo.dtypes)

# Change the type of seconds to float
ufo["seconds"] = ufo["seconds"].astype('float')

# Change the date column to type datetime
ufo["date"] = pd.to_datetime(ufo["date"])

# Check the column types
print(ufo[["date", "seconds"]].dtypes)

date               object
city               object
state              object
country            object
type               object
seconds           float64
length_of_time     object
desc               object
recorded           object
lat                object
long              float64
dtype: object
date       datetime64[ns]
seconds           float64
dtype: object


In [60]:
ufo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4935 entries, 0 to 4934
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            4935 non-null   datetime64[ns]
 1   city            4926 non-null   object        
 2   state           4516 non-null   object        
 3   country         4255 non-null   object        
 4   type            4776 non-null   object        
 5   seconds         4935 non-null   float64       
 6   length_of_time  4792 non-null   object        
 7   desc            4932 non-null   object        
 8   recorded        4935 non-null   object        
 9   lat             4935 non-null   object        
 10  long            4935 non-null   float64       
dtypes: datetime64[ns](1), float64(2), object(8)
memory usage: 424.2+ KB


In [61]:
# Check how many values are missing in the length_of_time, state, and type columns
print(ufo[['length_of_time', 'state', 'type']].isnull().sum())

# Keep only rows where length_of_time, state, and type are not null
ufo_no_missing = ufo[ufo['length_of_time'].notnull() & 
          ufo['state'].notnull() & 
          ufo['type'].notnull()]

# Print out the shape of the new dataset
print(ufo_no_missing.shape)

length_of_time    143
state             419
type              159
dtype: int64
(4283, 11)


In [70]:
ufo.dropna(inplace=True)

In [71]:
ufo[['length_of_time']]

Unnamed: 0,length_of_time
0,2 weeks
1,30sec.
3,about 5 minutes
4,2
5,10 minutes
...,...
4930,about 5 seconds
4931,25 seconds
4932,early morning
4933,2 hours


In [73]:
def return_minutes(time_string):

    # Use \d+ to grab digits
    pattern = re.compile(r"\d+")
    
    # Use match on the pattern and column
    num = re.match(pattern, time_string)
    if num is not None:
        return int(num.group(0))
        
# Apply the extraction to the length_of_time column
ufo["minutes"] = ufo["length_of_time"].apply(lambda row: return_minutes(row))

# Take a look at the head of both of the columns
print(ufo[["length_of_time", "minutes"]].head(10))
print(ufo[["length_of_time"]])

               length_of_time  minutes
0                     2 weeks      2.0
1                      30sec.     30.0
3             about 5 minutes      NaN
4                           2      2.0
5                  10 minutes     10.0
6   total? maybe around 10 mi      NaN
7   several sightings from 10      NaN
8                   2 minutes      2.0
9                   2 minutes      2.0
10                  5 minutes      5.0
       length_of_time
0             2 weeks
1              30sec.
3     about 5 minutes
4                   2
5          10 minutes
...               ...
4930  about 5 seconds
4931       25 seconds
4932    early morning
4933          2 hours
4934        1 minutes

[3891 rows x 1 columns]


In [74]:
ufo["minutes"].isnull().sum()

449

In [77]:
ufo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3891 entries, 0 to 4934
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            3891 non-null   datetime64[ns]
 1   city            3891 non-null   object        
 2   state           3891 non-null   object        
 3   country         3891 non-null   object        
 4   type            3891 non-null   object        
 5   seconds         3891 non-null   float64       
 6   length_of_time  3891 non-null   object        
 7   desc            3891 non-null   object        
 8   recorded        3891 non-null   object        
 9   lat             3891 non-null   object        
 10  long            3891 non-null   float64       
 11  minutes         3442 non-null   float64       
dtypes: datetime64[ns](1), float64(3), object(8)
memory usage: 395.2+ KB


In [76]:
ufo.dropna().info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3442 entries, 0 to 4934
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            3442 non-null   datetime64[ns]
 1   city            3442 non-null   object        
 2   state           3442 non-null   object        
 3   country         3442 non-null   object        
 4   type            3442 non-null   object        
 5   seconds         3442 non-null   float64       
 6   length_of_time  3442 non-null   object        
 7   desc            3442 non-null   object        
 8   recorded        3442 non-null   object        
 9   lat             3442 non-null   object        
 10  long            3442 non-null   float64       
 11  minutes         3442 non-null   float64       
dtypes: datetime64[ns](1), float64(3), object(8)
memory usage: 349.6+ KB


In [78]:
# Check the variance of the seconds and minutes columns
print(ufo[['seconds', 'minutes']].var())

# Log normalize the seconds column
ufo["seconds_log"] = np.log(ufo['seconds'])

# Print out the variance of just the seconds_log column
print(ufo['seconds_log'].var())

seconds    1.691246e+10
minutes    1.020612e+03
dtype: float64
nan


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [82]:
ufo['seconds_log'].var()

nan

In [83]:
ufo.head()

Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long,minutes,seconds_log
0,2011-11-03 19:21:00,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,-92.291111,2.0,14.0058
1,2004-10-03 19:05:00,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.4994444,-81.695556,30.0,3.401197
3,2002-11-21 05:45:00,clemmons,nc,us,triangle,300.0,about 5 minutes,It was a large&#44 triangular shaped flying ob...,12/23/2002,36.0213889,-80.382222,,5.703782
4,2010-08-19 12:55:00,calgary (canada),ab,ca,oval,0.0,2,A white spinning disc in the shape of an oval.,8/24/2010,51.083333,-114.083333,2.0,-inf
5,2012-06-16 23:00:00,san diego,ca,us,light,600.0,10 minutes,Dancing lights that would fly around and then ...,7/4/2012,32.7152778,-117.156389,10.0,6.39693


In [84]:
# Use Pandas to encode us values as 1 and others as 0
ufo["country_enc"] = ufo["country"].apply(lambda val: 1 if val == 'us' else 0 )

# Print the number of unique type values
print(len(ufo['type'].unique()))

# Create a one-hot encoded set of the type values
type_set = pd.get_dummies(ufo['type'])

# Concatenate this set back to the ufo DataFrame
ufo = pd.concat([ufo, type_set], axis=1)

21


In [108]:
ufo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3442 entries, 0 to 4934
Data columns (total 37 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            3442 non-null   datetime64[ns]
 1   city            3442 non-null   object        
 2   state           3442 non-null   object        
 3   country         3442 non-null   object        
 4   type            3442 non-null   object        
 5   seconds         3442 non-null   float64       
 6   length_of_time  3442 non-null   object        
 7   desc            3442 non-null   object        
 8   recorded        3442 non-null   object        
 9   lat             3442 non-null   object        
 10  long            3442 non-null   float64       
 11  minutes         3442 non-null   float64       
 12  seconds_log     3442 non-null   float64       
 13  country_enc     3442 non-null   int64         
 14  changing        3442 non-null   uint8         
 15  chev

In [109]:
ufo.isnull().sum()

date              0
city              0
state             0
country           0
type              0
seconds           0
length_of_time    0
desc              0
recorded          0
lat               0
long              0
minutes           0
seconds_log       0
country_enc       0
changing          0
chevron           0
cigar             0
circle            0
cone              0
cross             0
cylinder          0
diamond           0
disk              0
egg               0
fireball          0
flash             0
formation         0
light             0
other             0
oval              0
rectangle         0
sphere            0
teardrop          0
triangle          0
unknown           0
month             0
year              0
dtype: int64

In [110]:
ufo.dropna(inplace=True)

In [111]:
# Look at the first 5 rows of the date column
print(ufo['date'].head())

# Extract the month from the date column
ufo["month"] = ufo["date"].apply(lambda row: row.month)

# Extract the year from the date column
ufo["year"] = ufo["date"].apply(lambda row: row.year)

# Take a look at the head of all three columns
print(ufo[['date', 'month', 'year']].head())

0   2011-11-03 19:21:00
1   2004-10-03 19:05:00
4   2010-08-19 12:55:00
5   2012-06-16 23:00:00
8   2013-06-09 00:00:00
Name: date, dtype: datetime64[ns]
                 date  month  year
0 2011-11-03 19:21:00     11  2011
1 2004-10-03 19:05:00     10  2004
4 2010-08-19 12:55:00      8  2010
5 2012-06-16 23:00:00      6  2012
8 2013-06-09 00:00:00      6  2013


In [112]:
# Take a look at the head of the desc field
print(ufo['desc'].head())

# Create the tfidf vectorizer object
vec = TfidfVectorizer()

# Use vec's fit_transform method on the desc field
desc_tfidf = vec.fit_transform(ufo['desc'])

# Look at the number of columns this creates
print(desc_tfidf.shape)

0    Red blinking objects similar to airplanes or s...
1                 Many fighter jets flying towards UFO
4       A white spinning disc in the shape of an oval.
5    Dancing lights that would fly around and then ...
8    Brilliant orange light or chinese lantern at o...
Name: desc, dtype: object
(3442, 4966)


In [105]:
words_to_filter

<function __main__.words_to_filter(vocab, original_vocab, vector, top_n)>