In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
import matplotlib.pyplot as plt
%matplotlib inline

import string
import seaborn as sns

import lightgbm as lgb
import xgboost as xgb
from sklearn import ensemble, metrics, model_selection, naive_bayes
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [2]:
data_in = pd.read_csv('data/fj1.csv', dtype={"resolution": str, "gsubname2":str, "gname3":str, "gsubname3":str, "claimmode3_txt":str, "weaptype4_txt":str, "weapsubtype4_txt":str, "divert":str, "kidhijcountry":str})

In [3]:
def positive(x):
    if isinstance(x,str):
        return x
    else:
        return str(0);

In [4]:
nlp_feature = ['country_txt', 'region_txt', 'provstate', 'city', 'alternative_txt', 'attacktype1_txt', 'targtype1_txt', 'motive', 'weaptype1_txt', 'scite1', 'scite2', 'scite3']

for feature in nlp_feature:
    data_in[feature].fillna('N', inplace = True)
    data_in[feature].apply(positive)

In [5]:
data_in['nlp'] = data_in[['country_txt', 'region_txt']].apply(lambda x: ''.join(x), axis=1)
data_in['nlp'] = data_in[['nlp', 'provstate']].apply(lambda x: ''.join(x), axis=1)
data_in['nlp'] = data_in[['nlp', 'city']].apply(lambda x: ''.join(x), axis=1)
data_in['nlp'] = data_in[['nlp', 'alternative_txt']].apply(lambda x: ''.join(x), axis=1)
data_in['nlp'] = data_in[['nlp', 'attacktype1_txt']].apply(lambda x: ''.join(x), axis=1)
data_in['nlp'] = data_in[['nlp', 'targtype1_txt']].apply(lambda x: ''.join(x), axis=1)
data_in['nlp'] = data_in[['nlp', 'motive']].apply(lambda x: ''.join(x), axis=1)
data_in['nlp'] = data_in[['nlp', 'weaptype1_txt']].apply(lambda x: ''.join(x), axis=1)
data_in['nlp'] = data_in[['nlp', 'scite1']].apply(lambda x: ''.join(x), axis=1)
data_in['nlp'] = data_in[['nlp', 'scite2']].apply(lambda x: ''.join(x), axis=1)
data_in['nlp'] = data_in[['nlp', 'scite3']].apply(lambda x: ''.join(x), axis=1)

In [6]:
data = pd.DataFrame({
        'eventid': data_in['eventid'],
        'nlp': data_in['nlp'],
        'gname': data_in['gname']
    })

In [None]:
#data.to_csv('data/nlp_data.csv', index=False)

构造训练集

In [7]:
train = data[data.gname != 'Unknown']

In [None]:
#train.to_csv('data/nlp_train.csv', index=False)

构造测试集

In [8]:
out21 = pd.read_csv('data/21out.csv')
test_in = out21[out21.gname == 'Unknown']
test_in = pd.merge(test_in,data_in,on='eventid',how='left')

In [9]:
test = pd.DataFrame({
        'eventid': test_in['eventid'],
        'nlp': test_in['nlp'],
    })

In [None]:
#test.to_csv('data/nlp_test.csv', index=False)

In [10]:
train.head()

Unnamed: 0,eventid,gname,nlp
0,199801010001,Hutu extremists,BurundiSub-Saharan AfricaBujumbura MairieBujum...
2,199801010003,Loyalist Volunteer Forces (LVF),United KingdomWestern EuropeNorthern IrelandBe...
5,199801040001,Kosovo Liberation Army (KLA),MacedoniaEastern EuropeKumanovo (Municipality)...
6,199801040002,Kosovo Liberation Army (KLA),MacedoniaEastern EuropePrilep (Municipality)Pr...
9,199801050003,Hutu extremists,RwandaSub-Saharan AfricaGitaramaGitaramaNArmed...


In [28]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54681 entries, 0 to 114180
Data columns (total 3 columns):
eventid    54681 non-null int64
gname      54681 non-null object
nlp        54681 non-null object
dtypes: int64(1), object(2)
memory usage: 1.7+ MB


In [11]:
test.head()

Unnamed: 0,eventid,nlp
0,201501010013,PakistanSouth AsiaSindhKarachiOther Crime Type...
1,201501010074,YemenMiddle East & North AfricaMaribUnknownIns...
2,201501020049,Sri LankaSouth AsiaNorth WesternKurunegalaOthe...
3,201501020058,PakistanSouth AsiaKhyber PakhtunkhwaShinawariI...
4,201501020081,AfghanistanSouth AsiaNangarharJalalabadInsurge...


In [30]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2049 entries, 0 to 2048
Data columns (total 2 columns):
eventid    2049 non-null int64
nlp        2049 non-null object
dtypes: int64(1), object(1)
memory usage: 48.0+ KB


模型构建

In [13]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf.fit(data['nlp'])
train_x = tfidf.transform(train['nlp'])
test_x = tfidf.transform(test['nlp'])
print('tfidf prepared !')

tfidf prepared !


In [14]:
train_x

<54681x104128 sparse matrix of type '<class 'numpy.float64'>'
	with 2030384 stored elements in Compressed Sparse Row format>

In [15]:
test_x

<2049x104128 sparse matrix of type '<class 'numpy.float64'>'
	with 57458 stored elements in Compressed Sparse Row format>

In [21]:
#author_mapping_dict = {'Islamic State of Iraq and the Levant (ISIL)':0, 'Taliban':1, 'Al-Shabaab':2, 'Boko Haram':3, 'Houthi extremists (Ansar Allah)':4}
#train_y = train['gname'].map(author_mapping_dict)
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
train_y = enc.fit_transform(train.gname.values)

In [22]:
from sklearn import ensemble, metrics, model_selection, naive_bayes
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(train_x, train_y)
predictions = clf.predict_proba(train_x)

cv_scores = []
cv_scores.append(metrics.log_loss(train_y, predictions))
print("Mean cv score : ", np.mean(cv_scores))
print("Acc :", 1-np.mean(cv_scores))

Mean cv score :  4.086477955674451
Acc : -3.086477955674451


In [24]:
prediction = clf.predict_proba(test_x)
prediction

array([[1.04668355e-10, 1.14934004e-10, 2.45280989e-11, ...,
        2.46308194e-11, 2.40433174e-11, 2.36522531e-11],
       [1.19683438e-08, 1.19649164e-08, 3.10571934e-09, ...,
        3.01081057e-09, 3.01906806e-09, 3.02447983e-09],
       [1.58776788e-06, 1.57038221e-06, 3.96514816e-07, ...,
        3.93000707e-07, 3.93005979e-07, 3.93018251e-07],
       ...,
       [3.64930310e-08, 3.61318193e-08, 8.87180859e-09, ...,
        8.86529719e-09, 8.86727677e-09, 9.07828397e-09],
       [2.38359518e-09, 2.46168672e-09, 5.63708272e-10, ...,
        5.83772584e-10, 5.87990222e-10, 5.80016423e-10],
       [3.91655409e-10, 3.86044593e-10, 9.74725709e-11, ...,
        9.71036239e-11, 9.73520784e-11, 1.03085921e-10]])

In [25]:
out_df = pd.DataFrame(prediction)
out_df.columns = list(enc.classes_)
out_df.insert(0, 'eventid', test['eventid'])
out_df.to_csv("data/out_2_2.csv", index=False)

In [26]:
df = out_df.drop('eventid',axis=1)
pred = df.idxmax(axis=1)

In [27]:
submission = pd.DataFrame({
        "eventid": test['eventid'],
        "gname": pred
    })
submission.to_csv('data/out_2_1.csv', index=False)