In [1]:
import zipfile
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re

In [2]:
# NOTE(review): gensim is not imported by any cell visible here — confirm this install is still needed.
!pip install gensim



In [3]:
# Unpack the challenge archive into the current working directory.
localzip = 'spoonshot_internship_hiring_ai_challenge-dataset.zip'
# The context manager closes the archive handle even if extraction fails.
with zipfile.ZipFile(localzip, 'r') as zip_ref:
    zip_ref.extractall()

In [4]:
# Load the labelled training split and the test split from the extracted CSV files.
Train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ("TRAIN.csv", "TEST.csv")
)

In [5]:
# Re-label the test rows so their index continues directly after the training rows.
# The bounds are derived from the frames themselves instead of being hard-coded
# (17000 / 20972), so the cell keeps working if either file changes size.
train_rows = len(Train_data)
test_data = test_data.set_index(np.arange(train_rows, train_rows + len(test_data)))

In [6]:
# Keep only the two free-text columns of the training frame.
text_columns = ['TITLE', 'ABSTRACT']
text_data = Train_data[text_columns]

In [7]:
# Preview the first rows of the TITLE/ABSTRACT frame.
text_data.head()

Unnamed: 0,TITLE,ABSTRACT
0,ChemGAN challenge for drug discovery: can AI r...,Generating molecules with desired chemical p...
1,Hybrid graphene tunneling photoconductor with ...,Hybrid graphene photoconductor/phototransist...
2,Temperature Dependence of Magnetic Excitations...,When an ordered spin system of a given dimen...
3,A Las Vegas algorithm to solve the elliptic cu...,"In this paper, we describe a new Las Vegas a..."
4,Comparing simulations and test data of a radia...,The VIS instrument on board the Euclid missi...


In [8]:
# Fetch the NLTK resources used further down: the English stop-word list and WordNet.
# nltk is imported once; the second, redundant import has been dropped.
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# Stack the training text on top of the test text so both receive identical preprocessing.
# `axis` is passed by keyword: the positional form is deprecated in pandas 1.x and
# rejected by pandas >= 2.0.
total_data = pd.concat([text_data, test_data], axis=0)
total_data.shape

(20972, 2)

In [10]:
# Preview the re-indexed test frame.
test_data.head()

Unnamed: 0,TITLE,ABSTRACT
17000,On the wave propagation analysis and supratran...,"In this research, we investigate the nonline..."
17001,Thermal and non-thermal emission from the coco...,We present hydrodynamic simulations of the h...
17002,HTEM data improve 3D modelling of aquifers in ...,"In Paris Basin, we evaluate how HTEM data co..."
17003,Chirality-induced Antisymmetry in Magnetic Dom...,"In chiral magnetic materials, numerous intri..."
17004,In-home and remote use of robotic body surroga...,People with profound motor deficits could pe...


In [11]:
# English stop-word list consulted by nlp_preprocessing.
# NOTE(review): this rebinds the name `stopwords`, shadowing the `nltk.corpus.stopwords`
# object imported in the first cell.
stopwords = nltk.corpus.stopwords.words('english')
def nlp_preprocessing(total_text, index, column):
    """Clean one text cell and store the result back into the global ``total_data``.

    Every character that is not an ASCII letter, a digit or a newline is
    replaced by a space, whitespace runs are collapsed, the text is
    lower-cased, and every token found in the module-level ``stopwords``
    collection is removed.

    Args:
        total_text: Raw text of the cell. Non-string values (NaN, None,
            numbers) are ignored instead of crashing inside ``re.sub``.
        index: Row label of the cell in ``total_data``.
        column: Column label of the cell in ``total_data``.

    Returns:
        None. The cleaned text, with each kept token followed by a single
        space, is written to ``total_data.loc[index, column]``.
    """
    if not isinstance(total_text, str):
        return

    # Replace every special character with a space (raw strings avoid the
    # invalid-escape warning that '\s' triggers in a normal literal).
    total_text = re.sub(r'[^a-zA-Z0-9\n]', ' ', total_text)
    # Collapse runs of whitespace (including newlines) into one space.
    total_text = re.sub(r'\s+', ' ', total_text)
    total_text = total_text.lower()

    # A set gives constant-time membership tests for the stop-word filter.
    blocked = frozenset(stopwords)
    cleaned = "".join(word + " " for word in total_text.split() if word not in blocked)

    # Single .loc write instead of chained indexing, which may write to a
    # temporary copy and is a silent no-op under pandas copy-on-write.
    total_data.loc[index, column] = cleaned

In [12]:
# Clean every TITLE cell in place; rows whose title is not a string are only reported.
for row_label, record in total_data.iterrows():
    title = record['TITLE']
    if type(title) is not str:
        print("there is no text description for id:",row_label)
        continue
    nlp_preprocessing(title, row_label, 'TITLE')

In [13]:
# Clean the ABSTRACT column; rows whose abstract is not a string are only reported.
for index, row in total_data.iterrows():
    if type(row['ABSTRACT']) is str:
        nlp_preprocessing(row['ABSTRACT'], index, 'ABSTRACT')
    else:
        print("there is no text description for id:",index)

In [14]:
# Stemming: a single PorterStemmer instance shared by PorterStemming().
# NOTE(review): SnowballStemmer is imported but not used in the cells visible here.
from nltk.stem.snowball import PorterStemmer , SnowballStemmer
ps = PorterStemmer()
def PorterStemming(col , total_data):
    """Stem every whitespace-separated token of one text column, in place.

    Uses the module-level stemmer ``ps``.

    Args:
        col: Label of the column in ``total_data`` to rewrite.
        total_data: DataFrame holding the text; its ``col`` column is replaced.

    Returns:
        None. Each string cell becomes one leading space followed by the
        stemmed tokens, each followed by one space. Non-string cells
        (e.g. NaN) are left unchanged.
    """
    def stem_text(text):
        # Missing values have no tokens to stem.
        if not isinstance(text, str):
            return text
        return " " + "".join(ps.stem(word) + " " for word in text.split())

    # Map over the values rather than looking rows up by range(n): this works for
    # any index (not only 0..n-1), and whole-column assignment avoids chained
    # indexing writes that may land on a temporary copy.
    total_data[col] = total_data[col].map(stem_text)

In [15]:
# Stem both text columns in place, ABSTRACT first and then TITLE.
for text_column in ('ABSTRACT', 'TITLE'):
    PorterStemming(text_column, total_data)

In [16]:
# A single WordNetLemmatizer instance shared by the lemmatizer() helper.
from nltk.stem import WordNetLemmatizer
lm=WordNetLemmatizer()
def lemmatizer(col , total_data):
    """Lemmatize every whitespace-separated token of one text column, in place.

    Uses the module-level lemmatizer ``lm``.

    Args:
        col: Label of the column in ``total_data`` to rewrite.
        total_data: DataFrame holding the text; its ``col`` column is replaced.

    Returns:
        None. Each string cell becomes one leading space followed by the
        lemmatized tokens, each followed by one space. Non-string cells
        (e.g. NaN) are left unchanged.
    """
    def lemmatize_text(text):
        # Missing values have no tokens to lemmatize.
        if not isinstance(text, str):
            return text
        return " " + "".join(lm.lemmatize(word) + " " for word in text.split())

    # Map over the values rather than looking rows up by range(n): this works for
    # any index (not only 0..n-1), and whole-column assignment avoids chained
    # indexing writes that may land on a temporary copy.
    total_data[col] = total_data[col].map(lemmatize_text)

In [17]:
# Lemmatize both text columns in place, TITLE first and then ABSTRACT.
for text_column in ('TITLE', 'ABSTRACT'):
    lemmatizer(text_column, total_data)

In [18]:
# Join title and abstract into one TEXT document per row, separated by a space.
titles = total_data['TITLE']
abstracts = total_data['ABSTRACT']
total_data['TEXT'] = titles.str.cat(abstracts, sep=" ")

In [19]:
# Keep only the combined TEXT column. The columns are named by keyword because the
# positional `axis` argument of DataFrame.drop is rejected by pandas >= 2.0.
total_data = total_data.drop(columns=['TITLE', 'ABSTRACT'])

In [20]:
# Inspect the first rows of the combined TEXT column.
total_data.head()

Unnamed: 0,TEXT
0,chemgan challeng drug discoveri ai reproduc n...
1,hybrid graphen tunnel photoconductor interfac...
2,temperatur depend magnet excit terahertz magn...
3,la vega algorithm solv ellipt curv discret lo...
4,compar simul test data radiat damag charg cou...


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Turn every TEXT document into a TF-IDF row vector.
# NOTE(review): the vocabulary and IDF weights are fitted on training and test text
# together — confirm this is intended for the evaluation below.
corpus = total_data['TEXT']
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

In [22]:
# Split the TF-IDF matrix back into its training and test rows. The boundary is the
# number of training rows rather than a hard-coded 17000.
n_train_rows = len(Train_data)
X_train = X[:n_train_rows]
X_test = X[n_train_rows:]

In [23]:
# Report the (rows, features) shape of the training and test matrices.
for feature_matrix in (X_train, X_test):
    print(feature_matrix.shape)

(17000, 39098)
(3972, 39098)


In [24]:
# List the training columns to identify the label columns.
Train_data.columns

Index(['TITLE', 'ABSTRACT', 'Computer Science', 'Physics', 'Mathematics',
       'Statistics', 'Quantitative Biology', 'Quantitative Finance', 'labels'],
      dtype='object')

In [25]:
# Multi-label target: one 0/1 indicator column per subject area.
label_columns = [
    'Computer Science',
    'Physics',
    'Mathematics',
    'Statistics',
    'Quantitative Biology',
    'Quantitative Finance',
]
Y = Train_data[label_columns]

In [26]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the labelled rows for validation, shuffled with a fixed seed.
xtrain, xtest, ytrain, ytest = train_test_split(
    X_train, Y, test_size=0.30, shuffle=True, random_state=42
)
for split_part in (xtrain, xtest, ytrain, ytest):
    print(split_part.shape)

(11900, 39098)
(5100, 39098)
(11900, 6)
(5100, 6)


In [27]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

In [28]:
# Random forest with a fixed seed, wrapped in MultiOutputClassifier and fitted on the
# training split against all label columns.
forest = RandomForestClassifier(random_state=1)
multi_target_forest = MultiOutputClassifier(forest)
multi_target_forest.fit(xtrain, ytrain)

MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True,
                                                       ccp_alpha=0.0,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=None,
                                                       max_features='auto',
                                                       max_leaf_nodes=None,
                                                       max_samples=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                  

In [29]:
# Predict the label indicators for the hold-out split with the random forest.
y_pred = multi_target_forest.predict(xtest)

In [30]:
from sklearn.metrics import accuracy_score
# NOTE(review): with a multi-column target this is presumably exact-match (subset)
# accuracy, i.e. a row counts only if every label is right — confirm in the sklearn docs.
acc = accuracy_score(ytest,y_pred)
print(acc)

0.5982352941176471


In [31]:
from sklearn.naive_bayes import GaussianNB
# Densify the training matrix before fitting GaussianNB. The conversion is guarded so
# that re-running this cell does not fail on an ndarray, which has no .toarray().
if hasattr(xtrain, "toarray"):
    xtrain = xtrain.toarray()
gnb = GaussianNB()
multi_NB = MultiOutputClassifier(gnb)
multi_NB.fit(xtrain, ytrain)

MultiOutputClassifier(estimator=GaussianNB(priors=None, var_smoothing=1e-09),
                      n_jobs=None)

In [32]:
# Densify the hold-out matrix once (guarded so the cell can be re-run safely), then
# predict with the naive Bayes model.
if hasattr(xtest, "toarray"):
    xtest = xtest.toarray()
y_pred1 = multi_NB.predict(xtest)

In [33]:
# Hold-out accuracy of the naive Bayes model (overwrites the previous `acc`).
acc = accuracy_score(ytest,y_pred1)
print(acc)

0.4419607843137255


In [34]:
# Predict the label indicators for the unlabelled test rows with the random forest.
y_pred_R = multi_target_forest.predict(X_test) 

In [35]:
# Wrap the random-forest test predictions in a DataFrame.
pred_df = pd.DataFrame(y_pred_R)

In [36]:

# Name the prediction columns after the label columns of the training target.
pred_df.columns = ytrain.columns

In [37]:
# Sanity check: one row per test document, one column per label.
pred_df.shape

(3972, 6)

In [38]:
# Placeholder column; a later cell overwrites it with the predicted label names.
pred_df["Result"] = 0

In [39]:
# Collect, for every test row, the names of the labels predicted as 1.
# All label columns are used: slicing with [:-1] dropped "Quantitative Finance",
# so that label could never appear in the result.
labels = list(ytrain.columns)
# Build the whole column at once instead of writing lists cell-by-cell through
# chained indexing into an integer column.
pred_df["Result"] = pd.Series(
    [
        [name for name, flag in zip(labels, row_flags) if flag == 1]
        for row_flags in pred_df[labels].to_numpy()
    ],
    index=pred_df.index,
)


In [40]:
def _first_label(picked):
    """Return the first predicted label, or 0 when nothing was predicted."""
    return picked[0] if len(picked) > 0 else 0

# Reduce each list of predicted labels to a single value.
pred_df['Result'] = pred_df['Result'].apply(_first_label)

In [42]:
# Distribution of the random-forest results over the test rows.
pred_df['Result'].value_counts()

Computer Science    1681
Physics              886
Mathematics          744
0                    516
Statistics           145
Name: Result, dtype: int64

In [43]:
# Densify the test matrix once (guarded so the cell can be re-run safely), then
# predict with the naive Bayes model.
if hasattr(X_test, "toarray"):
    X_test = X_test.toarray()
y_pred_ = multi_NB.predict(X_test)


In [44]:
# Wrap the naive Bayes test predictions in a frame labelled with the target names.
pred_df1 = pd.DataFrame(y_pred_)
pred_df1.columns = ytrain.columns
# Collect, for every test row, the names of the labels predicted as 1.
# All label columns are used: slicing with [:-1] dropped "Quantitative Finance",
# so that label could never appear in the result.
labels = list(ytrain.columns)
# Build the whole column at once instead of writing lists cell-by-cell through
# chained indexing into an integer column.
pred_df1["Result"] = pd.Series(
    [
        [name for name, flag in zip(labels, row_flags) if flag == 1]
        for row_flags in pred_df1[labels].to_numpy()
    ],
    index=pred_df1.index,
)

In [45]:
# Spot check: number of labels predicted for the first test row.
len(pred_df1['Result'][0])

1

In [46]:
# Collapse each label list into its submission string: comma-joined label names, or
# the "Physics" fallback when no label was predicted. Cells that are already strings
# are kept unchanged, so re-running this cell cannot split them into characters.
pred_df1['Result'] = pred_df1['Result'].map(
    lambda picked: picked
    if isinstance(picked, str)
    else (",".join(picked) if len(picked) else "Physics")
)

In [47]:
# Distribution of the naive Bayes label combinations over the test rows.
pred_df1['Result'].value_counts()

Physics                                                                 1253
Computer Science                                                         709
Mathematics                                                              438
Computer Science,Statistics                                              412
Computer Science,Physics,Mathematics,Statistics                          210
Computer Science,Mathematics,Statistics                                  143
Computer Science,Mathematics                                             135
Computer Science,Physics,Statistics                                      121
Statistics                                                               108
Physics,Mathematics                                                      107
Computer Science,Physics                                                  98
Mathematics,Statistics                                                    65
Computer Science,Physics,Mathematics,Statistics,Quantitative Biology      45

In [None]:
# Select the naive Bayes result column for export.
data = pred_df1['Result']

In [None]:
# Write the selected result column to a CSV file in the working directory.
data.to_csv("Amreesh.csv")

In [49]:
from sklearn.linear_model import LogisticRegression
# Default-configured logistic regression, wrapped in MultiOutputClassifier and fitted
# on the training split.
# NOTE(review): warnings are silenced in the first cell, so a possible convergence
# warning (max_iter=100 per the repr below) would not be shown — confirm convergence.
lg = LogisticRegression()
multi_lg = MultiOutputClassifier(lg)
multi_lg.fit(xtrain, ytrain)

MultiOutputClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                   dual=False,
                                                   fit_intercept=True,
                                                   intercept_scaling=1,
                                                   l1_ratio=None, max_iter=100,
                                                   multi_class='auto',
                                                   n_jobs=None, penalty='l2',
                                                   random_state=None,
                                                   solver='lbfgs', tol=0.0001,
                                                   verbose=0,
                                                   warm_start=False),
                      n_jobs=None)

In [50]:
# Predict the label indicators for the hold-out split with logistic regression.
y_pred2 = multi_lg.predict(xtest)

In [51]:
# Hold-out accuracy of the logistic-regression model (overwrites the previous `acc`).
acc = accuracy_score(ytest,y_pred2)
print(acc)

0.6331372549019608


In [52]:
# Predict the label indicators for the unlabelled test rows with logistic regression.
y_pred_lg = multi_lg.predict(X_test) 

In [53]:
# Wrap the logistic-regression test predictions in a frame labelled with the target
# names, plus a placeholder "Result" column that a later cell overwrites.
pred_df4 = pd.DataFrame(y_pred_lg, columns=ytrain.columns).assign(Result=0)

In [54]:
# Collect, for every test row, the names of the labels predicted as 1.
# All label columns are used: slicing with [:-1] dropped "Quantitative Finance",
# so that label could never appear in the result.
labels = list(ytrain.columns)
# Build the whole column at once instead of writing lists cell-by-cell through
# chained indexing into an integer column.
pred_df4["Result"] = pd.Series(
    [
        [name for name, flag in zip(labels, row_flags) if flag == 1]
        for row_flags in pred_df4[labels].to_numpy()
    ],
    index=pred_df4.index,
)


In [55]:
# Display the logistic-regression predictions together with the collected label lists.
pred_df4

Unnamed: 0,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,Result
0,0,1,0,0,0,0,[Physics]
1,0,1,0,0,0,0,[Physics]
2,0,0,0,0,0,0,[]
3,0,1,0,0,0,0,[Physics]
4,1,0,0,0,0,0,[Computer Science]
...,...,...,...,...,...,...,...
3967,0,0,0,1,0,0,[Statistics]
3968,0,0,1,1,0,0,"[Mathematics, Statistics]"
3969,0,0,0,0,0,0,[]
3970,1,0,0,0,0,0,[Computer Science]


In [56]:
# Collapse each label list into its submission string: comma-joined label names, or
# the "Physics" fallback when no label was predicted. Cells that are already strings
# are kept unchanged, so re-running this cell cannot split them into characters.
pred_df4['Result'] = pred_df4['Result'].map(
    lambda picked: picked
    if isinstance(picked, str)
    else (",".join(picked) if len(picked) else "Physics")
)

In [59]:
# Distribution of the logistic-regression label combinations over the test rows.
pred_df4['Result'].value_counts()

Physics                                    1317
Computer Science                           1043
Mathematics                                 725
Computer Science,Statistics                 464
Statistics                                  207
Mathematics,Statistics                      119
Computer Science,Mathematics                 52
Physics,Mathematics                          19
Computer Science,Physics                     14
Computer Science,Mathematics,Statistics       8
Quantitative Biology                          2
Statistics,Quantitative Biology               1
Physics,Statistics                            1
Name: Result, dtype: int64

In [60]:
# Export the logistic-regression result column to a CSV file in the working directory.
data = pred_df4['Result']
data.to_csv("submission3.csv")

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with a fixed seed, wrapped in MultiOutputClassifier and fitted on
# the training split; the fitted estimator is stored in `model`.
clf = GradientBoostingClassifier(random_state=42)
model  = MultiOutputClassifier(clf)
model.fit(xtrain,ytrain)


In [None]:
# Score the gradient-boosting estimator (`model`) on the hold-out split. Predicting
# with `multi_lg` here would only repeat the logistic-regression accuracy.
y_pred5 = model.predict(xtest)
acc = accuracy_score(ytest, y_pred5)
print(acc)

In [None]:
# Predict the test rows with the gradient-boosting estimator (`model`); using
# `multi_lg` would make this submission identical to the logistic-regression one.
y_pred_gb = model.predict(X_test)

In [None]:
# Wrap the `y_pred_gb` test predictions in a frame labelled with the target names,
# plus a placeholder "Result" column that a later cell overwrites.
pred_df5 = pd.DataFrame(y_pred_gb, columns=ytrain.columns).assign(Result=0)

In [None]:
# Collect, for every test row, the names of the labels predicted as 1.
# All label columns are used: slicing with [:-1] dropped "Quantitative Finance",
# so that label could never appear in the result.
labels = list(ytrain.columns)
# Build the whole column at once instead of writing lists cell-by-cell through
# chained indexing into an integer column.
pred_df5["Result"] = pd.Series(
    [
        [name for name, flag in zip(labels, row_flags) if flag == 1]
        for row_flags in pred_df5[labels].to_numpy()
    ],
    index=pred_df5.index,
)


In [None]:
# Collapse each label list into its submission string: comma-joined label names, or
# the "Physics" fallback when no label was predicted. Only pred_df5 is involved (the
# row count is no longer taken from pred_df4), and cells that are already strings are
# kept unchanged so re-running this cell cannot split them into characters.
pred_df5['Result'] = pred_df5['Result'].map(
    lambda picked: picked
    if isinstance(picked, str)
    else (",".join(picked) if len(picked) else "Physics")
)

In [None]:
# Export the pred_df5 result column to a CSV file in the working directory.
data = pred_df5['Result']
data.to_csv("submission5.csv")