In [1]:
# -*- encoding:utf-8 -*-
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.externals import joblib
import pickle
import pandas as pd

## 训练集

In [2]:
# Read the labelled training set. '\n' is passed explicitly as the line terminator
# (presumably so stray '\r' characters inside reviews do not split rows -- TODO confirm).
df = pd.read_csv(
    './kesci/train.csv',
    lineterminator='\n',
)
df.head(n=5)

Unnamed: 0,ID,review,label
0,1,Jo bhi ap se tou behtar hoon,Negative
1,2,ya Allah meri sister Affia ki madad farma,Positive
2,3,Yeh khud chahta a is umar main shadi krna. ha...,Negative
3,4,Tc ? Apky mun xe exe alfax achy nae lgty 😒💃,Negative
4,5,Good,Positive


In [3]:
# Print the number of training rows, then show the container type of the label column.
print(df['label'].shape[0])
type(df['label'])

6328


pandas.core.series.Series

In [4]:
# Class balance: count the rows whose label matches each sentiment exactly.
print('Positive: ', int((df['label'] == 'Positive').sum()))
print('Negative: ', int((df['label'] == 'Negative').sum()))

Positive:  3361
Negative:  2967


## 测试集

In [5]:
# Read the unlabelled test set with the same parsing options as the training file,
# print how many reviews it holds, and preview the first rows.
df_test = pd.read_csv(
    './kesci/20190506_test.csv',
    lineterminator='\n',
)
print(df_test['review'].shape[0])
df_test.head(n=5)

2712


Unnamed: 0,ID,review
0,1,Hum logo ny 70 salo ma itna loan ni lia jitna ...
1,2,Us dor ke mushahir ke sath us ke gehre taluqat...
2,3,"Is ke ilawa ap ki nazmon, geeton aur ghazlon ..."
3,4,Iss mein koi burai nahi
4,5,Iss ke baad azadi ki jaddojehad mein congress...


## 模型

In [6]:
def load_data(type='train'):
    """
    Build model inputs from the module-level ``df`` / ``df_test`` frames.

    Reviews are lower-cased; labels are encoded as 1 for 'Positive' and 0 for
    every other value.

    Parameters
    ----------
    type : str
        'train' reads ``df`` and returns reviews together with labels. Any
        other value reads ``df_test`` and returns reviews only. (The name
        shadows the builtin but is kept so existing callers keep working.)

    Returns
    -------
    tuple (list of str, list of int) when ``type == 'train'``,
    otherwise a list of str.
    """
    def _lowercased(reviews):
        # read_csv turns empty cells into NaN (a float), which has no .lower();
        # map missing reviews to '' and coerce everything to str before lowering.
        return reviews.fillna('').astype(str).str.lower().tolist()

    if type == 'train':
        data = _lowercased(df['review'])
        # 'Positive' -> 1, anything else -> 0
        labels = [1 if label == 'Positive' else 0 for label in df['label']]
        return data, labels
    else:
        data = _lowercased(df_test['review'])
        return data

def train_tfidf(train_data):
    """
    Fit a scikit-learn TF-IDF vectorizer on the training reviews.

    Parameters
    ----------
    train_data : iterable of str
        Documents used to learn the vocabulary and IDF weights.

    Returns
    -------
    The fitted vectorizer (1- to 3-gram features, terms seen in at least
    5 documents, at most 5000 features).
    """
    # use_idf / smooth_idf are boolean options; pass real bools rather than 1,
    # since newer scikit-learn releases validate parameter types.
    # Original author's note on this configuration: 0.85030136
    # (presumably the score obtained with these settings -- TODO confirm).
    tfidf = TFIDF(
        min_df=5,
        max_features=5000,
        ngram_range=(1, 3),
        use_idf=True,
        smooth_idf=True,
    )
    tfidf.fit(train_data)
    return tfidf

def train_SVC(data_vec, label, random_state=42):
    """
    Fit a probability-calibrated linear SVM.

    LinearSVC is wrapped in CalibratedClassifierCV so that the returned model
    offers ``predict_proba``, which the prediction code in this notebook calls.

    Parameters
    ----------
    data_vec : feature matrix for the training documents.
    label : sequence of class labels aligned with ``data_vec``.
    random_state : int or None, default 42
        Seed forwarded to LinearSVC so repeated runs give the same model.
        Pass None to restore the unseeded behaviour.

    Returns
    -------
    The fitted CalibratedClassifierCV instance.
    """
    base_svc = LinearSVC(random_state=random_state)
    clf = CalibratedClassifierCV(base_svc)
    clf.fit(data_vec, label)
    return clf

In [7]:
def train():
    """
    Fit the TF-IDF vectorizer and the calibrated SVM, then save both to disk.

    The artifacts are written to 'SVCmodel/tfidf.model' and
    'SVCmodel/svc.model'; the directory is created when it does not exist.
    """
    import os  # local import: only needed to prepare the output directory

    # Lower-cased reviews and 0/1 labels
    train_data, labels = load_data('train')

    tfidf = train_tfidf(train_data)
    train_vec = tfidf.transform(train_data)
    model = train_SVC(train_vec, labels)

    print('model saving...')
    # joblib.dump does not create missing parent directories.
    os.makedirs('SVCmodel', exist_ok=True)
    joblib.dump(tfidf, 'SVCmodel/tfidf.model')
    joblib.dump(model, 'SVCmodel/svc.model')

def predict():
    """
    Score the test reviews with the vectorizer and classifier stored on disk.

    Loads 'SVCmodel/tfidf.model' and 'SVCmodel/svc.model' (the paths written by
    ``train``) and returns the ``predict_proba`` output for the test reviews.
    """
    reviews = load_data('test')
    print('load model...')
    # NOTE(review): joblib.load unpickles the files; only load artifacts you trust.
    vectorizer = joblib.load('SVCmodel/tfidf.model')
    classifier = joblib.load('SVCmodel/svc.model')
    print('predict...')
    return classifier.predict_proba(vectorizer.transform(reviews))

def train_no_save_model():
    """
    Train in memory and score the test set without writing any model files.

    Returns the ``predict_proba`` output for the test reviews.
    """
    # Lower-cased training reviews with their 0/1 labels
    reviews_train, targets = load_data('train')

    vectorizer = train_tfidf(reviews_train)
    classifier = train_SVC(vectorizer.transform(reviews_train), targets)

    reviews_test = load_data('test')
    return classifier.predict_proba(vectorizer.transform(reviews_test))

In [8]:
# Alternative workflow that saves the fitted models and predicts from the saved files:
# train()
# test_predict = predict()
test_predict = train_no_save_model()
# Column 1 of each row is the probability of label 1 ('Positive').
test_predict_positive = list(test_predict[:, 1])
print(test_predict[:5])

[[0.70528446 0.29471554]
 [0.27158667 0.72841333]
 [0.17670025 0.82329975]
 [0.6589864  0.3410136 ]
 [0.41008157 0.58991843]]




In [9]:
# Write the prediction file for submission: one row per test ID with its
# predicted probability of the positive class.
test_ids = df_test['ID']
Data = dict(ID=test_ids, Pred=test_predict_positive)
# NOTE(review): the DataFrame index is also written as an extra leading column;
# confirm the submission format expects it (otherwise index=False would be needed).
pd.DataFrame(Data, columns=['ID', 'Pred']).to_csv(
    'test_pred_20190506.csv',
    header=True,
)
print('Done')

Done
