In [1]:
import re
import numpy as np
import pandas as pd
from collections import Counter
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer as TF
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import cross_val_score
from conf import config

In [2]:
# Read the training and test splits from the shared data directory,
# then preview the first training rows.
train_path, test_path = '../data/train.csv', '../data/test.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
train_df.head()

Unnamed: 0,abst,keyword,label
0,"法律介入企业融资的正当性不在于难易,而在于公平.企业有权以各种正当、合法的方式融资,发展普惠...","['公平融资权', '金融市场结构', '竞争性供给', '普惠金融']",1
1,"视频解码是一类最典型的多媒体应用,其计算量大、耗能高.现代多媒体计算平台可利用视频解码计算复...","['线性模型', '视频解码', '计算复杂度', '进行']",0
2,"通过对数值模拟不确定度产生机制的理论分析以及对不确定度从考核区到应用区发展趋势的反演,展示数...","['可靠性认证', '验证与确认', '数值模拟不确定度']",1
3,"邬焜先生在“信息本体论”、“信息认识论”、“信息进化论”的基础上,推导出“信息价值论”.行文...","['物质结构', '信息哲学', '信息结构', '中介粒子场']",1
4,"20世纪80年代以来,世界经济出现了以美国为中心的全球失衡现象,表现为美国的经常项目和投资净...","['国际分工', '金融危机', '美元特权', '美元']",0


In [3]:
def test(row):
    columns = ['abst', 'keyword']
    ans = ','.join(row['keyword']) + row['abst']
    return ans    

In [4]:
# Add the combined keyword + abstract column to both splits, row by row.
for frame in (train_df, test_df):
    frame['text'] = frame.apply(test, axis=1)

In [5]:
# Preview the first five combined texts of the test split.
test_df['text'].iloc[:5]

0    [,',大,型,双,驱,龙,门,机,床,',,, ,',H,∞,控,制,',,, ,',龙,...
1    [,',公,证,人,员,',,, ,',理,论,创,新,',,, ,',管,理,部,门,',...
2    [,',承,水,压,',,, ,',耦,合,关,系,',,, ,',渗,透,特,性,',,,...
3    [,',正,交,试,验,',,, ,',乙,酸,酐,',,, ,',木,棉,纤,维,',,,...
4    [,',法,国,学,位,制,度,',,, ,',法,国,工,程,教,育,',,, ,',工,...
Name: text, dtype: object

In [6]:
# Shuffle the training rows with a fixed seed. Row order influences how the
# later cv=10 cross-validation assigns folds, so an unseeded shuffle makes the
# reported score change on every run.
SEED = 42
train_df = train_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Word-level TF-IDF limited to the 5000 most frequent terms; every other
# option is left at its scikit-learn default.
# NOTE(review): the default word tokenizer only splits on punctuation and
# whitespace, so unsegmented Chinese clauses presumably become single tokens.
# Consider a word segmenter or analyzer="char" with n-grams — confirm.
tfidf = TF(analyzer="word", max_features=5000)

# Vectorize the data
print("Creating the tfidf vector...\n")
# Fit the vocabulary on the training text only, then reuse it for the test set.
x_train = tfidf.fit_transform(train_df['text']).toarray()
x_test = tfidf.transform(test_df['text']).toarray()

print(x_train.shape)
print(x_test.shape)

Creating the tfidf vector...

(18400, 5000)
(4600, 5000)


In [7]:
# Target vector: the label column of the (shuffled) training frame.
y_train = train_df.loc[:, "label"]

In [8]:
# Class balance check: number of samples per label value.
label_counts = Counter(y_train)
print(label_counts)

Counter({1: 9249, 0: 9151})


In [9]:
# Logistic regression baseline trained on the full training matrix.
# fit() returns the estimator itself, so the chained call leaves the fitted
# model in `model`; the bare name on the last line displays it.
model = LR(solver='liblinear').fit(x_train, y_train)
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
# Mean accuracy over 10 cross-validation folds on the training data.
# NOTE(review): the recorded run scored ~0.33 on a roughly balanced binary
# target, i.e. below chance — worth investigating before trusting this model.
print("10折交叉验证：")
cv_scores = cross_val_score(model, x_train, y_train, cv=10, scoring="accuracy")
print(cv_scores.mean())

10折交叉验证：
0.32717391304347826


In [11]:
# Predict the test split and write a headerless two-column CSV (id, pred).
preds = model.predict(x_test)
# Ids are 1-based and follow the row order of the test set.
submission = pd.DataFrame({'id': range(1, len(preds) + 1), 'pred': preds})
submission.to_csv("../data/ml_submission.csv", index=False, header=False)
submission.head()

Unnamed: 0,id,pred
0,1,1
1,2,0
2,3,1
3,4,0
4,5,1
