* 导入库

In [1]:
import time
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.preprocessing import LabelBinarizer

import lightgbm as lgb
from scipy.sparse import csr_matrix, hstack

In [2]:
train_data = pd.read_csv('train_clean_new.csv', sep="\t")
test_data = pd.read_csv('test.csv',sep='\t')

In [3]:
train_data.head(3)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,Black Lace Slimming Panties Medium NWT,1,Women/Underwear/Panties,,7.0,0,Black Lace Slimming Panties. Size Medium. New ...
1,1,small cat harness and leash,2,Other/Pet Supplies/Others,PetSafe,11.0,0,royal blue. for cats or small pets up to 12 lbs
2,2,VS iPhone case and lanyard,1,Electronics/Cell Phones Accessories/Cases Cov...,,11.0,1,Listing is for one phone case Hard cover case ...


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 474710 entries, 0 to 474709
Data columns (total 8 columns):
train_id             474710 non-null int64
name                 474708 non-null object
item_condition_id    474710 non-null int64
category_name        472655 non-null object
brand_name           272297 non-null object
price                474710 non-null float64
shipping             474710 non-null int64
item_description     474586 non-null object
dtypes: float64(1), int64(3), object(4)
memory usage: 29.0+ MB


* item_description特征缺失76行，将缺item_description的行删除

In [5]:
train_data['item_description'] = train_data['item_description'].fillna('missing').astype(str)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 474710 entries, 0 to 474709
Data columns (total 8 columns):
train_id             474710 non-null int64
name                 474708 non-null object
item_condition_id    474710 non-null int64
category_name        472655 non-null object
brand_name           272297 non-null object
price                474710 non-null float64
shipping             474710 non-null int64
item_description     474710 non-null object
dtypes: float64(1), int64(3), object(4)
memory usage: 29.0+ MB


In [6]:
dele=[]
for i in range(len(train_data['item_description'])):
    if train_data['item_description'][i] == 'missing':
        dele.append(i)

for i in range(len(dele)):
    train_data.drop([dele[i]],inplace=True)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 474586 entries, 0 to 474709
Data columns (total 8 columns):
train_id             474586 non-null int64
name                 474584 non-null object
item_condition_id    474586 non-null int64
category_name        472533 non-null object
brand_name           272260 non-null object
price                474586 non-null float64
shipping             474586 non-null int64
item_description     474586 non-null object
dtypes: float64(1), int64(3), object(4)
memory usage: 32.6+ MB


In [7]:
train_data['category_name'].value_counts()[:10]

Women/Athletic Apparel/Pants Tights Leggings                19229
Women/Tops  Blouses/TShirts                                 14778
Beauty/Makeup/Face                                          11002
Beauty/Makeup/Lips                                           9635
Electronics/Video Games  Consoles/Games                      8495
Beauty/Makeup/Eyes                                           8107
Electronics/Cell Phones  Accessories/Cases Covers  Skins     7897
Women/Underwear/Bras                                         6706
Women/Tops  Blouses/Tank Cami                                6559
Women/Tops  Blouses/Blouse                                   6544
Name: category_name, dtype: int64

In [8]:
# 训练集标签
y_train = np.log1p(train_data['price'])

* 缺失值填充

In [9]:
train_data['name'] = train_data['name'].fillna('missing').astype(str)
train_data['category_name'] = train_data['category_name'].fillna('missing').astype(str)
train_data['brand_name'] = train_data['brand_name'].fillna('missing').astype(str)

* 强制类型转换 

In [10]:
train_data['train_id'] = train_data['train_id'].astype(str)
train_data['item_condition_id'] = train_data['item_condition_id'].astype(str)
train_data['shipping'] = train_data['shipping'].astype(str)
train_data['price'] = train_data['price'].astype(str)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 474586 entries, 0 to 474709
Data columns (total 8 columns):
train_id             474586 non-null object
name                 474586 non-null object
item_condition_id    474586 non-null object
category_name        474586 non-null object
brand_name           474586 non-null object
price                474586 non-null object
shipping             474586 non-null object
item_description     474586 non-null object
dtypes: object(8)
memory usage: 32.6+ MB


* PorterStemmer

In [11]:
'''
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
train_data=train_data.applymap(stemmer.stem)
train_data.info()
'''

'\nfrom nltk.stem import PorterStemmer\n\nstemmer = PorterStemmer()\ntrain_data=train_data.applymap(stemmer.stem)\ntrain_data.info()\n'

* 合并训练集和测试集

In [12]:
# 测试集有为空的
df = pd.concat([train_data, test_data], axis=0)
df.info()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


<class 'pandas.core.frame.DataFrame'>
Int64Index: 524586 entries, 0 to 49999
Data columns (total 9 columns):
brand_name           503370 non-null object
category_name        524369 non-null object
item_condition_id    524586 non-null object
item_description     524586 non-null object
name                 524586 non-null object
price                474586 non-null object
shipping             524586 non-null object
test_id              50000 non-null float64
train_id             474586 non-null object
dtypes: float64(1), object(8)
memory usage: 40.0+ MB


* df处理，缺失值填充

In [13]:
df = df.drop(['price', 'test_id', 'train_id'], axis=1)
df['category_name'] = df['category_name'].fillna('MISS').astype(str)
df['brand_name'] = df['brand_name'].fillna('missing').astype(str)
df['item_description'] = df['item_description'].fillna('No')

df['shipping'] = df['shipping'].astype(str)
df['item_condition_id'] = df['item_condition_id'].astype(str)

# 训练数据的行数
nrow_train = train_data.shape[0]

df.head()

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,shipping
0,missing,Women/Underwear/Panties,1,Black Lace Slimming Panties. Size Medium. New ...,Black Lace Slimming Panties Medium NWT,0
1,PetSafe,Other/Pet Supplies/Others,2,royal blue. for cats or small pets up to 12 lbs,small cat harness and leash,0
2,missing,Electronics/Cell Phones Accessories/Cases Cov...,1,Listing is for one phone case Hard cover case ...,VS iPhone case and lanyard,1
3,LuLaRoe,Women/Tops Blouses/Wrap,1,BRAND NEW WITH TAGS,LuLaRoe Medium Shirley,0
4,LuLaRoe,Women/Jeans/Leggings,3,Worn and washed a few times per LLR,OS Leggings GUC,0


In [14]:
df.columns

Index(['brand_name', 'category_name', 'item_condition_id', 'item_description',
       'name', 'shipping'],
      dtype='object')

* CountVectorizer + Tfidf

In [15]:
# name特征向量化，输出稀疏矩阵
cv = CountVectorizer(min_df=10)
X_name = cv.fit_transform(df['name'])

In [16]:
# category_name特征向量化，输出稀疏矩阵
cv = CountVectorizer()
X_category = cv.fit_transform(df['category_name'])

In [17]:
# 利用tfidf向量化item_description属性，使用停词
tv = TfidfVectorizer(max_features=60000, ngram_range=(1, 2), stop_words='english')
X_description = tv.fit_transform(df['item_description'])

In [18]:
# 品牌+名字
brand_cate = df['brand_name']+' '+df['name']
X_brand_cate = tv.fit_transform(brand_cate)

In [19]:
# 描述+名字
des_cate = df['item_description']+' '+df['name']
X_des_cate = tv.fit_transform(des_cate)

In [20]:
# 名称+描述 +
#name_cate = df['name']+' '+df['item_description']
#X_name_item = tv.fit_transform(name_cate)

In [21]:
# 名称+种类+牌子
name_cate = df['name']+' '+df['category_name']+ ' '+df['brand_name']
X_name_cate = tv.fit_transform(name_cate)

In [22]:
X_brand = tv.fit_transform(df['brand_name'])

In [23]:
# 合并item_condition_id，shipping属性形成一个新属性
X_dummies = csr_matrix(pd.get_dummies(df[['item_condition_id', 'shipping']], sparse=True).values)

# 合并各属性，形成一个系数矩阵
sparse_merge = hstack((X_dummies, X_description, X_brand, X_category, X_name, X_des_cate, X_name_cate, X_brand_cate)).tocsr()

* 划分训练集与测试集

In [24]:
X_train_sparse = sparse_merge[:nrow_train]
X_test = sparse_merge[nrow_train:]

In [35]:
X_train_sparse.shape

(474586, 257344)

# Ridge

In [25]:
from sklearn.linear_model import Ridge

In [26]:
def ridgeClassify(train_data, train_label):
    ridgeClf = Ridge(
        solver='auto',
        fit_intercept=True,
        alpha=0.5,
        max_iter=500,
        normalize=False,
        tol=0.05)
    # 训练
    ridgeClf.fit(train_data, train_label)
    return ridgeClf

# 交叉验证

In [27]:
ridgeClf = ridgeClassify(X_train_sparse, y_train)

In [28]:
test_price = ridgeClf.predict(X_test)

In [29]:
true_price = pd.read_csv("label_test.csv", sep="\t").price.tolist()

In [30]:
y_pre_true = np.expm1(test_price)

* 0.241 df['brand_name']+' '+df['category_name']
* 0.24731811722829436 df['item_description']+' '+df['category_name']

* 0.23686654378532468 df['item_description']+' '+df['name']     df['brand_name']+' '+df['category_name']

* 0.24108560740325458 df['item_description']+' '+df['name']     df['item_description']+' '+df['category_name']    df['item_description']+' '+df['item_condition_id']

* 0.23634671796262421 df['item_description']+' '+df['name']     df['brand_name']+' '+df['category_name'] tv brand_new

* 0.23277704301764082 品牌+种类  名称+种类   描述+名字  tv  0.23173446002772038

* 0.23501340567423148 df['name']+' '+df['category_name']+' '+df['item_description']

* 0.22947310231523774 名称+种类+牌子  描述+名字

In [31]:
from sklearn.metrics import mean_squared_log_error
mean_squared_log_error(true_price, y_pre_true)
# 0.22869706937857867

0.160330849591999

In [32]:
print(y_pre_true)

[20.68575045  6.57446259 12.80121958 ... 71.90502733 11.32038824
 30.559119  ]


In [33]:
print(true_price)

[24.0, 4.0, 20.0, 8.0, 41.0, 10.0, 7.0, 131.0, 67.0, 56.0, 36.0, 15.0, 90.0, 20.0, 7.0, 50.0, 31.0, 16.0, 13.0, 11.0, 12.0, 14.0, 20.0, 20.0, 16.0, 5.0, 10.0, 10.0, 22.0, 18.0, 7.0, 14.0, 19.0, 36.0, 38.0, 7.0, 19.0, 69.0, 16.0, 18.0, 35.0, 6.0, 61.0, 50.0, 13.0, 7.0, 291.0, 250.0, 10.0, 42.0, 20.0, 24.0, 15.0, 30.0, 31.0, 27.0, 14.0, 22.0, 24.0, 15.0, 35.0, 18.0, 23.0, 32.0, 9.0, 20.0, 13.0, 12.0, 13.0, 10.0, 19.0, 9.0, 7.0, 16.0, 18.0, 12.0, 299.0, 10.0, 14.0, 7.0, 28.0, 4.0, 48.0, 16.0, 14.0, 36.0, 11.0, 11.0, 54.0, 10.0, 20.0, 44.0, 11.0, 76.0, 41.0, 31.0, 15.0, 14.0, 17.0, 19.0, 24.0, 25.0, 13.0, 4.0, 10.0, 19.0, 26.0, 15.0, 49.0, 15.0, 25.0, 9.0, 54.0, 15.0, 16.0, 34.0, 8.0, 36.0, 68.0, 16.0, 17.0, 5.0, 20.0, 14.0, 33.0, 40.0, 15.0, 18.0, 9.0, 41.0, 26.0, 18.0, 8.0, 51.0, 36.0, 13.0, 14.0, 6.0, 19.0, 7.0, 23.0, 9.0, 16.0, 16.0, 30.0, 36.0, 14.0, 36.0, 56.0, 9.0, 20.0, 6.0, 71.0, 40.0, 5.0, 3.0, 21.0, 5.0, 10.0, 39.0, 6.0, 6.0, 20.0, 23.0, 12.0, 12.0, 10.0, 24.0, 12.0, 26.0, 15.0,

# 写入文件

In [34]:
file = open('Ridge_pre.txt',mode = 'w')
for i in range(len(y_pre_true)):
    s = str(y_pre_true[i])
    s = s+'\n'
    file.write(s)
file.close()