In [1]:
# 1)import Packages
import pandas as pd
import numpy as np
import re
import pickle
import sparse_dot_topn.sparse_dot_topn as ct
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# 2) Load and describe data
# Every input has a header row and is read as ISO-8859-1 (Latin-1).
Master = pd.read_csv(
    "Master.csv",
    sep="|",
    header=0,
    encoding="ISO-8859-1",
)
Train = pd.read_csv(
    "Thirdparty_train.csv",
    sep="|",
    header=0,
    encoding="ISO-8859-1",
)
# NOTE(review): unlike the two files above, this one is read with the default
# comma separator (no sep="|") -- confirm that matches the test file's format.
Test = pd.read_csv(
    "Thirdparty_test.csv",
    header=0,
    encoding="ISO-8859-1",
)

In [14]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare: wrapping it in print() only appends a stray "None" line.
Master.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450256 entries, 0 to 450255
Data columns (total 2 columns):
company_id    450256 non-null int64
name          450256 non-null object
dtypes: int64(1), object(1)
memory usage: 6.9+ MB
None


In [15]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare: wrapping it in print() only appends a stray "None" line.
Train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
train_index    100000 non-null int64
name           100000 non-null object
company_id     100000 non-null int64
dtypes: int64(2), object(1)
memory usage: 2.3+ MB
None


In [18]:
# Number of rows currently in Train.
# NOTE(review): the recorded output (98788) differs from the 100000 rows that
# Train.info() reported above; presumably the drop_duplicates cell below was
# executed before this one -- confirm the intended cell order.
len(Train)

98788

In [None]:
# keep=False discards *every* row whose name occurs more than once, so only
# names that appear exactly once remain in Train.
Train = Train.drop_duplicates(subset="name",keep=False)
# Written next to the notebook, like the other files it reads and writes,
# instead of a machine-specific absolute path.
Train.to_csv("STrain2.csv")

In [19]:
### 3) Pre Processing of the data
def removespecial(review_text):
    """Replace special characters (&, <, >, commas, quotes, ...) with spaces.

    The input is coerced with ``str()``, so non-string values are accepted.
    Each run of consecutive non-word characters ends up as a single space;
    word characters (letters, digits, underscore) are left untouched.
    """
    as_text = str(review_text)
    # Question marks first, then every remaining run of non-word characters.
    without_question_marks = re.sub(r"\?", " ", as_text)
    return re.sub(r'\W+', ' ', without_question_marks)

def preparing_Dataset(source_data,train_data, train=1):
    """Return the cleaned name corpus together with the raw lookup lists.

    Parameters
    ----------
    source_data : DataFrame with "name" and "company_id" columns.
    train_data : DataFrame with "name" and "company_id" columns.
    train : only the value 1 is handled; any other value returns None.

    Returns
    -------
    tuple ``(names, G_Name, G_Id, STrain_Name, STrain_Id)`` where ``names`` is
    a Series (0..n-1 index) holding the cleaned ``source_data`` names followed
    by the cleaned ``train_data`` names, and the four lists are the original,
    uncleaned names and ids of ``source_data`` and ``train_data``.

    Neither input frame is modified, so calling this again on the same frames
    yields the same result.
    """
    if train != 1:
        # No other mode is implemented in this function.
        return None

    G_Name = source_data["name"].tolist()
    G_Id = source_data["company_id"].tolist()
    STrain_Name = train_data["name"].tolist()
    STrain_Id = train_data["company_id"].tolist()

    # Clean into new Series instead of writing back into the callers' frames.
    cleaned_source = source_data["name"].apply(removespecial)
    cleaned_train = train_data["name"].apply(removespecial)

    # Source rows first, then train rows; downstream code slices by position.
    names = pd.concat([cleaned_source, cleaned_train], axis=0, ignore_index=True)
    return names, G_Name, G_Id, STrain_Name, STrain_Id

In [20]:
# Final_Names: cleaned Master names followed by cleaned Train names.
# The other four values are the original names/ids of Master and Train as
# plain lists, used later to turn matrix positions back into records.
Final_Names,Master_Names,Master_Id,Train_names,Train_Id = preparing_Dataset(Master,Train)

In [3]:
## 4) Vectorization of the Data
def ngrams(text, n=3):
    """Split ``text`` into overlapping character n-grams.

    Before splitting, the characters ``,`` ``-`` ``.`` ``/`` and any
    whitespace character followed by "BD" are removed. A text shorter than
    ``n`` after that removal (or ``n <= 0``) yields an empty list.
    """
    # In the character class, ",-." is a range from ',' to '.', which covers
    # exactly ',', '-' and '.'; '/' is listed separately.
    stripped = re.sub(r'[,-./]|\sBD',r'', text)
    if n <= 0:
        return []
    last_start = len(stripped) - n
    return [stripped[start:start + n] for start in range(last_start + 1)]

def TFIDF(texts,G_count,S_count):
    """Fit one TF-IDF model on ``texts`` and return it as two row slices.

    The vectorizer is fitted on the whole corpus (character n-grams from
    ``ngrams``), so both returned matrices share the same columns.

    Returns ``(first, second)``: rows ``[0, G_count)`` and rows
    ``[G_count, G_count + S_count)`` of the fitted matrix. The matrix shape
    and ``G_count + S_count`` are printed so the two can be compared.
    """
    vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
    matrix = vectorizer.fit_transform(texts)
    total_rows = G_count + S_count
    print(matrix.shape)
    print(total_rows)
    first_rows = matrix[0:G_count, :]
    second_rows = matrix[G_count:total_rows, :]
    return first_rows, second_rows

In [22]:
# Vectorize the combined corpus once, then split it back by position:
# the first len(Master) rows are Master names, the next len(Train) rows are Train names.
Master_Matrix,Train_Matrix = TFIDF(Final_Names,len(Master),len(Train))

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(549044, 67923)
549044


In [23]:
# Sanity check: the row count should equal len(Master).
print(Master_Matrix.shape)

(450256, 67923)


In [24]:
# Sanity check: the row count should equal len(Train).
print(Train_Matrix.shape)

(98788, 67923)


In [25]:
## Cosine Distance Calculation

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Sparse product ``A * B`` that stores at most ``ntop`` entries per row.

    Thin wrapper around ``sparse_dot_topn``: it allocates the output CSR
    buffers, passes the CSR arrays of both operands to the compiled routine
    and wraps the filled buffers in a ``csr_matrix``.

    Parameters
    ----------
    A : scipy sparse matrix of shape (M, K).
    B : scipy sparse matrix of shape (K, N).
    ntop : maximum number of entries stored per row of the result.
    lower_bound : forwarded unchanged to ``sparse_dot_topn``; per that
        library's documentation only products above this value are kept.

    Returns
    -------
    csr_matrix of shape (M, N).

    Raises
    ------
    ValueError
        If the inner dimensions of ``A`` and ``B`` differ.
    """
    # Both operands are read through their CSR arrays (indptr/indices/data).
    A = A.tocsr()
    B = B.tocsr()

    M, K1 = A.shape
    K2, N = B.shape
    if K1 != K2:
        # Checked here because the compiled routine receives raw buffers only.
        raise ValueError(
            "dimension mismatch: A is %s but B is %s" % (A.shape, B.shape))

    idx_dtype = np.int32

    # Output buffers sized for the worst case of ntop entries in every row.
    nnz_max = M*ntop

    indptr = np.empty(M+1, dtype=idx_dtype)
    indices = np.empty(nnz_max, dtype=idx_dtype)
    data = np.empty(nnz_max, dtype=A.dtype)

    # Fills indptr/indices/data in place.
    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))

In [26]:

# One stored match per Train row (ntop=1). Master_Matrix is transposed so the
# product has one row per Train name and one column per Master name.
Cosine_Matrix = awesome_cossim_top(
    Train_Matrix,
    Master_Matrix.transpose(),
    ntop=1,
    lower_bound=0,
)
# Cache the result on disk.
with open("Cosine_Matrix.pickle", "wb") as cosine_file:
    pickle.dump(Cosine_Matrix, cosine_file, protocol=pickle.HIGHEST_PROTOCOL)

temp
temp


In [27]:
### 5) Matrix to Data Frame

def get_Final_Data(cosine_Matrix,G_CompanyName,G_CompanyId,STrain_CompanyName,STrain_CompanyId):
    """Expand a sparse match matrix into one DataFrame row per non-zero entry.

    Parameters
    ----------
    cosine_Matrix : scipy sparse matrix; the row position indexes the
        ``STrain_*`` sequences and the column position indexes the ``G_*``
        sequences.
    G_CompanyName, G_CompanyId : sequences looked up by column position.
    STrain_CompanyName, STrain_CompanyId : sequences looked up by row position.

    Returns
    -------
    DataFrame with columns ``Strain_data``, ``Strain_CompanyId``, ``G_data``,
    ``G_CompanyId`` (object dtype) and ``score`` (float), one row per stored
    non-zero value, in row-major order of the matrix.
    """
    # Take positions and values from the same COO view so they always line
    # up. Reading scores from cosine_Matrix.data by running index, as before,
    # is only correct when the matrix stores no explicit zeros, because
    # nonzero() silently skips those entries.
    coo = cosine_Matrix.tocoo()
    is_nonzero = coo.data != 0
    train_pos = coo.row[is_nonzero]
    master_pos = coo.col[is_nonzero]
    score = np.asarray(coo.data[is_nonzero], dtype=float)

    def _pick(values, positions):
        # Object dtype keeps names/ids as the original Python objects.
        return np.asarray(values, dtype=object)[positions]

    return pd.DataFrame({"Strain_data":_pick(STrain_CompanyName, train_pos),
                         "Strain_CompanyId":_pick(STrain_CompanyId, train_pos),
                         "G_data":_pick(G_CompanyName, master_pos),
                         "G_CompanyId":_pick(G_CompanyId, master_pos),
                         "score":score})

In [28]:
print(Cosine_Matrix.shape)
final = get_Final_Data(Cosine_Matrix,Master_Names,Master_Id,Train_names,Train_Id)
# Target label: 1 when the matched Master id equals the Train record's own id,
# else 0. The column is "Strain_CompanyId" (capital S) as built by
# get_Final_Data; a lowercase key raises KeyError. Compared column-wise
# instead of with a per-row apply.
final['output'] = (final['G_CompanyId'] == final['Strain_CompanyId']).astype(int)
final.to_csv("final.csv")

(98788, 450256)


In [29]:
## 6) Classifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

In [30]:
final = pd.read_csv("final.csv",header=0,encoding="ISO-8859-1")
# 70/30 split with a fixed seed so the split is reproducible.
final_train,final_test = train_test_split(final,test_size=0.30,random_state=2)
LRC = LogisticRegression(C=10, penalty='l2')
# X stays 2-D (a single "score" feature column); y is passed as a 1-D Series,
# matching the cross-validation call later in the notebook and avoiding the
# column-vector conversion warning that a one-column DataFrame triggers.
LRC.fit(final_train[["score"]],final_train['output'])

# clf2 = svm.LinearSVC(C=1.,dual=False,loss='l2', penalty='l2')
# clf2.fit(final_train[["score"]],final_train[['output']])

# clf3 = RandomForestClassifier(bootstrap ='True',max_depth=6,max_features='sqrt',n_estimators=50)
# clf3.fit(final_train[["score"]],final_train[['output']])

  y = column_or_1d(y, warn=True)


LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [31]:
# 7) Prediction
# Predict the 0/1 "output" label for the held-out split, using the similarity
# score as the only feature.
predict = LRC.predict(final_test[['score']])

In [32]:
## 8) Model Validation
# Confusion matrix via crosstab: rows are the actual labels of the held-out
# split, columns are the predicted labels.
pd.crosstab(final_test['output'],predict, rownames=["Actual"], colnames=["Predicted"])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9947,3202
1,3167,13219


In [33]:
# 9) Cross-validation: 20 folds on the training split, to check that the
# model's score is stable across different subsets of the data.
# The result holds one score per fold.
cross_val_score(LRC,final_train[["score"]],final_train['output'],cv=20)

array([0.79286336, 0.7827096 , 0.77835799, 0.79315347, 0.78670923,
       0.77713291, 0.78119559, 0.78612885, 0.78670923, 0.78961114,
       0.77822932, 0.78432511, 0.78984035, 0.78635704, 0.78258345,
       0.78664731, 0.78838897, 0.77997097, 0.79419448, 0.77474601])

In [34]:
# 10) Precision / recall report
# Per-class precision, recall and F1 of the predictions on the held-out split.
from sklearn.metrics import classification_report
report = classification_report(final_test['output'],predict)
print(report)

             precision    recall  f1-score   support

          0       0.76      0.76      0.76     13149
          1       0.81      0.81      0.81     16386

avg / total       0.78      0.78      0.78     29535



In [35]:
# 11) Persisting the model
# Use a context manager so the file is flushed and closed even if pickling
# fails (same pattern as the Cosine_Matrix dump earlier in the notebook).
with open('finalized_model.sav','wb') as model_file:
    pickle.dump(LRC,model_file)