In [61]:
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

from textblob import TextBlob

In [62]:
# Load the sample file and narrow it to the id and item-name columns.
sample_df = pd.read_csv("../../data/sample_data.csv")[['train_id', 'name']]
sample_df.head(3)

Unnamed: 0,train_id,name
0,748447,Polaroid Camera
1,674932,Vintage Ashtray
2,586672,Matilda Jane weekender nwt


# Extract Information about Text

## 1. Counts
    - Number of uppercase words / characters
    - Number of Words before text cleaning
    - Number of Characters before text cleaning

In [63]:
# Counts taken on the raw item name, before any cleaning.
raw_names = sample_df['name']

# Tokens (split on whitespace) for which str.isupper() is true.
sample_df['upper_word_count'] = raw_names.apply(
    lambda text: sum(1 for word in text.split() if word.isupper())
)
# Individual characters for which str.isupper() is true.
sample_df['upper_char_count'] = raw_names.apply(
    lambda text: sum(1 for char in text if char.isupper())
)
# Whitespace-separated token count.
sample_df['bef_word_count'] = raw_names.apply(lambda text: len(text.split()))
# Full string length, so spaces and punctuation are counted too.
sample_df['bef_char_count'] = raw_names.apply(len)
sample_df.head(5)

Unnamed: 0,train_id,name,upper_word_count,upper_char_count,bef_word_count,bef_char_count
0,748447,Polaroid Camera,0,2,2,15
1,674932,Vintage Ashtray,0,2,2,15
2,586672,Matilda Jane weekender nwt,0,2,4,26
3,846012,Red Full Lace Wig,0,4,4,17
4,1026408,EQUATE NICOTINE TRANSDERMAL SYSTEM,4,31,4,34


## 2. Count Number of Stop words

In [64]:
# English stop words from NLTK. stopwords.words() returns a list; wrapping it
# in a set makes every `in` / `not in` test a hash lookup rather than a linear
# scan. Later cells only use `stop` for membership tests, which a set supports
# unchanged.
stop = set(stopwords.words('english'))

# Number of tokens per name whose lower-cased form is a stop word.
sample_df['stopword_count'] = sample_df['name'].apply(
    lambda text: sum(1 for word in text.split() if word.lower() in stop)
)
sample_df['stopword_count'].max()

3

## 3. Count Number of Punctuation Characters

In [65]:
# Number of characters per name that appear in string.punctuation.
sample_df['punctuation_count'] = sample_df['name'].apply(
    lambda text: sum(1 for char in text if char in string.punctuation)
)
sample_df['punctuation_count'].max()

8

## 4. Count Number of Numeric Tokens (all-digit words)

In [66]:
# Number of whitespace-separated tokens made up entirely of digits
# (str.isdigit()). Digits embedded in a mixed token such as "64gb" do not count.
sample_df['number_count'] = sample_df['name'].apply(
    lambda text: sum(1 for token in text.split() if token.isdigit())
)
sample_df['number_count'].max()

3

## 5. Average Word Length

In [67]:
# Characters per word on the raw name.
# NOTE(review): bef_char_count is the full string length, so whitespace and
# punctuation are part of the numerator and the value runs slightly above the
# true mean token length (e.g. "Polaroid Camera" -> 15 / 2 = 7.5).
sample_df['bef_avg_word_len'] = sample_df['bef_char_count'].div(sample_df['bef_word_count'])
sample_df.head(5)

Unnamed: 0,train_id,name,upper_word_count,upper_char_count,bef_word_count,bef_char_count,stopword_count,punctuation_count,number_count,bef_avg_word_len
0,748447,Polaroid Camera,0,2,2,15,0,0,0,7.5
1,674932,Vintage Ashtray,0,2,2,15,0,0,0,7.5
2,586672,Matilda Jane weekender nwt,0,2,4,26,0,0,0,6.5
3,846012,Red Full Lace Wig,0,4,4,17,0,0,0,4.25
4,1026408,EQUATE NICOTINE TRANSDERMAL SYSTEM,4,31,4,34,0,0,0,8.5


# Text Cleaning

## 1. Make All Text Lower Case

In [75]:
# Seed the cleaned column with a lower-cased copy of each raw name.
sample_df['clean_name'] = [raw.lower() for raw in sample_df['name']]
sample_df['clean_name'].head(3)

0               polaroid camera
1               vintage ashtray
2    matilda jane weekender nwt
Name: clean_name, dtype: object

## 2. Removing Punctuation

In [76]:
# Delete every character listed in string.punctuation from the cleaned name.
# The translation table does not depend on the row, so it is built once here
# instead of once per name inside the lambda.
# NOTE: characters are deleted, not replaced by a space, so tokens joined only
# by punctuation are fused (e.g. "t-shirt" -> "tshirt").
punctuation_table = str.maketrans('', '', string.punctuation)
sample_df['clean_name'] = sample_df['clean_name'].apply(
    lambda text: text.translate(punctuation_table)
)

## 3. Removing Stopwords

In [77]:
# Drop every token that is in `stop`, then re-join the survivors with single spaces.
sample_df['clean_name'] = sample_df['clean_name'].apply(
    lambda text: " ".join([word for word in text.split() if word not in stop])
)

## 4. Correct Spelling 

In [78]:
# Run TextBlob's spelling correction over each cleaned name.
# NOTE(review): in this notebook's own output the corrector rewrites valid
# product terms ("vintage ashtray" -> "vantage astray", "nwt" -> "not",
# "equate" -> "quite"). Confirm this step is wanted for item names before
# relying on clean_name downstream.
sample_df['clean_name'] = sample_df['clean_name'].apply(
    lambda text: str(TextBlob(text).correct())
)

## 5. Lemmatization
- converting a word to its base form.

In [79]:
import ssl

# Workaround for TLS certificate errors when NLTK fetches its data.
# `_create_unverified_https_context` must be bound in this cell; no other
# visible cell defines it, so without this binding the assignment below fails
# with NameError on a fresh kernel.
# NOTE(review): this turns off HTTPS certificate verification for the whole
# process. Prefer installing proper CA certificates and removing this block.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Python build without this helper: keep the default (verifying) context.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download WordNet (reports "already up-to-date" when the corpus is present).
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /Users/jinchen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [80]:
# Init the WordNet lemmatizer.
lemmatizer = WordNetLemmatizer()

# lemmatize() looks up a single word. Given a whole multi-word name it finds
# no entry and hands the string back unchanged, so each name is split into
# tokens, every token is lemmatized on its own, and the tokens are re-joined
# with single spaces.
sample_df['clean_name'] = sample_df['clean_name'].apply(
    lambda text: " ".join(lemmatizer.lemmatize(word) for word in text.split())
)

In [81]:
# Preview the first five rows (head() defaults to n=5) with the cleaned name.
sample_df.head()

Unnamed: 0,train_id,name,upper_word_count,upper_char_count,bef_word_count,bef_char_count,stopword_count,punctuation_count,number_count,bef_avg_word_len,clean_name
0,748447,Polaroid Camera,0,2,2,15,0,0,0,7.5,polaroid camera
1,674932,Vintage Ashtray,0,2,2,15,0,0,0,7.5,vantage astray
2,586672,Matilda Jane weekender nwt,0,2,4,26,0,0,0,6.5,manila jane weekend not
3,846012,Red Full Lace Wig,0,4,4,17,0,0,0,4.25,red full lace wig
4,1026408,EQUATE NICOTINE TRANSDERMAL SYSTEM,4,31,4,34,0,0,0,8.5,quite nicotine transdermal system


## Extract Information about Text After Text Cleaning
- Number of Words after text cleaning
- Number of Characters after text cleaning
- Avg word length after text cleaning

In [82]:
# Same word / character / average-length features, recomputed on the cleaned name.
cleaned_names = sample_df['clean_name']
sample_df['aft_word_count'] = cleaned_names.apply(lambda text: len(text.split()))
sample_df['aft_char_count'] = cleaned_names.apply(len)
# NOTE(review): a name that cleaning reduced to an empty string has 0 words and
# 0 characters, so this division yields NaN for that row.
sample_df['aft_avg_word_len'] = sample_df['aft_char_count'].div(sample_df['aft_word_count'])
sample_df.head(5)

Unnamed: 0,train_id,name,upper_word_count,upper_char_count,bef_word_count,bef_char_count,stopword_count,punctuation_count,number_count,bef_avg_word_len,clean_name,aft_word_count,aft_char_count,aft_avg_word_len
0,748447,Polaroid Camera,0,2,2,15,0,0,0,7.5,polaroid camera,2,15,7.5
1,674932,Vintage Ashtray,0,2,2,15,0,0,0,7.5,vantage astray,2,14,7.0
2,586672,Matilda Jane weekender nwt,0,2,4,26,0,0,0,6.5,manila jane weekend not,4,23,5.75
3,846012,Red Full Lace Wig,0,4,4,17,0,0,0,4.25,red full lace wig,4,17,4.25
4,1026408,EQUATE NICOTINE TRANSDERMAL SYSTEM,4,31,4,34,0,0,0,8.5,quite nicotine transdermal system,4,33,8.25


In [83]:
# Print the frame summary (column names, non-null counts, dtypes, memory use)
# as a final check of the engineered columns before export.
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   train_id           1000 non-null   int64  
 1   name               1000 non-null   object 
 2   upper_word_count   1000 non-null   int64  
 3   upper_char_count   1000 non-null   int64  
 4   bef_word_count     1000 non-null   int64  
 5   bef_char_count     1000 non-null   int64  
 6   stopword_count     1000 non-null   int64  
 7   punctuation_count  1000 non-null   int64  
 8   number_count       1000 non-null   int64  
 9   bef_avg_word_len   1000 non-null   float64
 10  clean_name         1000 non-null   object 
 11  aft_word_count     1000 non-null   int64  
 12  aft_char_count     1000 non-null   int64  
 13  aft_avg_word_len   1000 non-null   float64
dtypes: float64(2), int64(10), object(2)
memory usage: 109.5+ KB


In [84]:
# Persist the engineered item-name features.
# NOTE(review): to_csv() writes the row index by default, so the file gets an
# extra unnamed first column. Pass index=False if downstream readers do not
# expect it.
sample_df.to_csv(path_or_buf="../../data/sample_data_item_name_features.csv")