In [1]:
import time
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import Ridge

from sklearn.pipeline import FeatureUnion

In [2]:
train_data = pd.read_csv('train.csv', sep="\t")
test_data = pd.read_csv('test.csv',sep='\t')

In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 8 columns):
train_id             300000 non-null int64
name                 300000 non-null object
item_condition_id    300000 non-null int64
category_name        298719 non-null object
brand_name           171929 non-null object
price                300000 non-null float64
shipping             300000 non-null int64
item_description     300000 non-null object
dtypes: float64(1), int64(3), object(4)
memory usage: 18.3+ MB


In [4]:
train_data.head(15)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,Blue crystal Heisenberg Funko pop!,3,Vintage & Collectibles/Toy/Action Figure,Funko,43.0,1,Minor flaws check pic for condition Bundle to ...
1,1,Funko Pop STAR WARS EMPEROR PALPATINE NM,2,Kids/Toys/Action Figures & Statues,Funko,33.0,1,Near mint 9/10
2,2,Black Otter Box iPhone 6 / 6s,3,"Electronics/Cell Phones & Accessories/Cases, C...",,14.0,0,Excellent condition black Otter Box phone case...
3,3,32ft Red lights,1,Home/Home Décor/Lamps& Accessories,,7.0,1,No description yet
4,4,Dress,3,"Women/Dresses/Above Knee, Mini",FOREVER 21,22.0,0,Short dress with Aztec design on the front and...
5,5,Bundle do not buy,3,Kids/Girls 0-24 Mos/One-Pieces,Disney,39.0,0,Girls sleeping beauty set 9 months
6,6,UGG wedge,3,Women/Shoes/Sandals,UGG Australia,31.0,0,No description yet
7,7,ON HOLD FOR ANNA!,1,Kids/Girls 0-24 Mos/One-Pieces,,26.0,0,Purchasing all as one bundle 3 zip up sleepers...
8,8,My Little Pony Castle,1,Kids/Toys/Dolls & Accessories,My Little Pony,25.0,1,Thank You for taking the time to look at my li...
9,9,Floral rug *New*,1,Home/Home Décor/Area Rugs & Pads,,35.0,0,5x7 Bright/happy floral colored rug never been...


In [5]:
train_data.isnull().any()

train_id             False
name                 False
item_condition_id    False
category_name         True
brand_name            True
price                False
shipping             False
item_description     False
dtype: bool

In [6]:
#计算各属性值的缺失百分比
total=train_data.isnull().sum().sort_values(ascending=False)#计算每个属性缺失数据个数，并排序
percent=(train_data.isnull().sum()/train_data.isnull().count()).sort_values(ascending=False)#计算每个属性缺失数据百分比，并排序
missing_data=pd.concat([total,percent],axis=1,keys=['Total','Percent'])
missing_data

Unnamed: 0,Total,Percent
brand_name,128071,0.426903
category_name,1281,0.00427
item_description,0,0.0
shipping,0,0.0
price,0,0.0
item_condition_id,0,0.0
name,0,0.0
train_id,0,0.0


In [7]:
def featureProcessing(df):

    # delete the data that will not be used
    df = df.drop(['price', 'train_id'], axis=1)
    # deal with the missing value with a default value
    df['category_name'] = df['category_name'].fillna('MISS').astype(str)
    df['brand_name'] = df['brand_name'].fillna('missing').astype(str)
    df['item_description'] = df['item_description'].fillna('No')
    # convert the data : int -> str
    df['shipping'] = df['shipping'].astype(str)
    df['item_condition_id'] = df['item_condition_id'].astype(str)

    return df

In [8]:
train_data=featureProcessing(train_data)
train_data

Unnamed: 0,name,item_condition_id,category_name,brand_name,shipping,item_description
0,Blue crystal Heisenberg Funko pop!,3,Vintage & Collectibles/Toy/Action Figure,Funko,1,Minor flaws check pic for condition Bundle to ...
1,Funko Pop STAR WARS EMPEROR PALPATINE NM,2,Kids/Toys/Action Figures & Statues,Funko,1,Near mint 9/10
2,Black Otter Box iPhone 6 / 6s,3,"Electronics/Cell Phones & Accessories/Cases, C...",missing,0,Excellent condition black Otter Box phone case...
3,32ft Red lights,1,Home/Home Décor/Lamps& Accessories,missing,1,No description yet
4,Dress,3,"Women/Dresses/Above Knee, Mini",FOREVER 21,0,Short dress with Aztec design on the front and...
5,Bundle do not buy,3,Kids/Girls 0-24 Mos/One-Pieces,Disney,0,Girls sleeping beauty set 9 months
6,UGG wedge,3,Women/Shoes/Sandals,UGG Australia,0,No description yet
7,ON HOLD FOR ANNA!,1,Kids/Girls 0-24 Mos/One-Pieces,missing,0,Purchasing all as one bundle 3 zip up sleepers...
8,My Little Pony Castle,1,Kids/Toys/Dolls & Accessories,My Little Pony,1,Thank You for taking the time to look at my li...
9,Floral rug *New*,1,Home/Home Décor/Area Rugs & Pads,missing,0,5x7 Bright/happy floral colored rug never been...


In [9]:
train_data.dtypes

name                 object
item_condition_id    object
category_name        object
brand_name           object
shipping             object
item_description     object
dtype: object

In [10]:
from nltk.stem import PorterStemmer

In [11]:
stemmer = PorterStemmer()
train_data=train_data.applymap(stemmer.stem)
train_data
# train_data.applymap(stemmer.stem)

Unnamed: 0,name,item_condition_id,category_name,brand_name,shipping,item_description
0,blue crystal heisenberg funko pop!,3,vintage & collectibles/toy/action figur,funko,1,minor flaws check pic for condition bundle to ...
1,funko pop star wars emperor palpatine nm,2,kids/toys/action figures & statu,funko,1,near mint 9/10
2,black otter box iphone 6 / 6,3,"electronics/cell phones & accessories/cases, c...",miss,0,excellent condition black otter box phone case...
3,32ft red light,1,home/home décor/lamps& accessori,miss,1,no description yet
4,dress,3,"women/dresses/above knee, mini",forever 21,0,short dress with aztec design on the front and...
5,bundle do not buy,3,kids/girls 0-24 mos/one-piec,disney,0,girls sleeping beauty set 9 month
6,ugg wedg,3,women/shoes/sand,ugg australia,0,no description yet
7,on hold for anna!,1,kids/girls 0-24 mos/one-piec,miss,0,purchasing all as one bundle 3 zip up sleepers...
8,my little pony castl,1,kids/toys/dolls & accessori,my little poni,1,thank you for taking the time to look at my li...
9,floral rug *new*,1,home/home décor/area rugs & pad,miss,0,5x7 bright/happy floral colored rug never been...


In [12]:
train_data['category_name'].nunique()

1115

In [13]:
train_data['category_name'].value_counts()[:10]

women/athletic apparel/pants, tights, leg                     12028
women/tops & blouses/t-shirt                                   9426
beauty/makeup/fac                                              6917
beauty/makeup/lip                                              5968
electronics/video games & consoles/gam                         5339
beauty/makeup/ey                                               5160
electronics/cell phones & accessories/cases, covers & skin     5051
women/underwear/bra                                            4410
women/tops & blouses/tank, cami                                4133
women/tops & blouses/blous                                     4105
Name: category_name, dtype: int64

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()


In [20]:
# pd.DataFrame(train_data['item_description']).applymap(vectorizer.fit_transform())
# vectorizer.fit_transform(corpus)
descrip_result=vectorizer.fit_transform(train_data['item_description'])
descrip_result

<300000x74675 sparse matrix of type '<class 'numpy.float64'>'
	with 6186867 stored elements in Compressed Sparse Row format>

In [25]:
# descrip_result.toarray(1)
print(descrip_result)

  (0, 45066)	0.44927260555146004
  (0, 28955)	0.39156418585883557
  (0, 17770)	0.345460874806672
  (0, 51606)	0.44177031768182273
  (0, 29467)	0.19078386666546296
  (0, 19822)	0.22878594925310367
  (0, 15675)	0.2836223877885039
  (0, 67550)	0.206585172960995
  (0, 58674)	0.3441900605031678
  (1, 47087)	0.7039179080438623
  (1, 45076)	0.5737172526772232
  (1, 349)	0.41874585695341743
  (2, 29467)	0.07256567066693646
  (2, 19822)	0.08701996734259802
  (2, 15675)	0.10787730192154912
  (2, 67550)	0.07857578257414283
  (2, 27040)	0.13352814987711947
  (2, 13574)	0.10183765855515256
  (2, 49532)	0.2635644833049926
  (2, 14688)	0.11301003579679939
  (2, 51497)	0.32438702057770874
  (2, 16983)	0.28358532685813365
  (2, 36816)	0.1503724679103935
  (2, 9601)	0.06254533982559227
  (2, 6134)	0.17366936462968624
  :	:
  (299998, 21629)	0.17021654908704364
  (299998, 51860)	0.1539854929782087
  (299998, 12464)	0.27071986644345336
  (299998, 40089)	0.17586915401611525
  (299998, 22212)	0.215000317035

In [27]:
vectorizer.fit_transform(['sparse matrix of type class numpy.float64 with 6186867 stored elements in Compressed Sparse Row format>']).toarray()

array([[0.23570226, 0.23570226, 0.23570226, 0.23570226, 0.23570226,
        0.23570226, 0.23570226, 0.23570226, 0.23570226, 0.23570226,
        0.23570226, 0.47140452, 0.23570226, 0.23570226, 0.23570226]])