# Reading Dataset

In [1]:
# Read dataset using pandas.
import pandas as pd 

data = pd.read_csv('dataset3.csv')

data.head()

Unnamed: 0.1,Unnamed: 0,input,target
0,0,New and new technology has been introduced to...,New technology has been introduced to society .
1,1,New and new technology has been introduced to...,New technology has been introduced into the so...
2,2,New and new technology has been introduced to...,Newer and newer technology has been introduced...
3,3,New and new technology has been introduced to...,Newer and newer technology has been introduced...
4,4,One possible outcome is that an environmental...,One possible outcome is that an environmentall...


In [2]:
df1 = pd.DataFrame(data['input'])
df2 = pd.DataFrame(data['target'])

In [3]:
df1.insert(1, 'Target', 0, True)
df2.insert(1, 'Target', 1, True )

In [4]:
df2.rename(columns = {'target': 'input'}, inplace = True)

In [5]:
df1.head()

Unnamed: 0,input,Target
0,New and new technology has been introduced to...,0
1,New and new technology has been introduced to...,0
2,New and new technology has been introduced to...,0
3,New and new technology has been introduced to...,0
4,One possible outcome is that an environmental...,0


In [6]:
df2.head()

Unnamed: 0,input,Target
0,New technology has been introduced to society .,1
1,New technology has been introduced into the so...,1
2,Newer and newer technology has been introduced...,1
3,Newer and newer technology has been introduced...,1
4,One possible outcome is that an environmentall...,1


In [111]:
dff = pd.concat([df1, df2], axis = 0)

print(len(dff))
dff.head()

12008


Unnamed: 0,input,Target
0,New and new technology has been introduced to...,0
1,New and new technology has been introduced to...,0
2,New and new technology has been introduced to...,0
3,New and new technology has been introduced to...,0
4,One possible outcome is that an environmental...,0


In [8]:
dff.dtypes

input     object
Target     int64
dtype: object

# Preprocessing in Dataset

In [9]:
# Make mask to check if have Missing values.
mask = dff.isnull().any(axis = 1)

data_cleaning = dff[~mask]

In [10]:
print(len(dff))
print(len(data_cleaning))

12008
12008


## Splitting dataset into Input and Output

In [11]:

# Splitting dataset to input and output.
data_input = dff['input']
data_output = dff['Target']

data_input.head()

0     New and new technology has been introduced to...
1     New and new technology has been introduced to...
2     New and new technology has been introduced to...
3     New and new technology has been introduced to...
4     One possible outcome is that an environmental...
Name: input, dtype: object

In [12]:
# Show the output.
data_output.head()

0    0
1    0
2    0
3    0
4    0
Name: Target, dtype: int64

In [14]:
import numpy as np

data_input = np.array(data_input)
data_output = np.array(data_output)

In [16]:
print(data_input[0])
print(data_input[6005])

 New and new technology has been introduced to the society .
New technology has been introduced into the society .


## Tokenization And Stemming

In [17]:
from tqdm import tqdm
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps = PorterStemmer()
import string
stem_token=[]

stopwords = stopwords.words('english')

for sen in tqdm(data_input):
    # Make Tokenizion and Stemming.
    token = word_tokenize(sen)
    tokens_stem = [ps.stem(word) for word in token]
    
    # Delete punctuation and stop words.
    sen = ' '.join(i for i in tokens_stem if i not in string.punctuation and i not in stopwords)
    stem_token.append(sen)

100%|██████████████████████████████████████████████████████████████████████████| 12008/12008 [00:09<00:00, 1224.09it/s]


In [18]:
# Show the first en item of list.
stem_token[0:10]

['new new technolog ha introduc societi',
 'new new technolog ha introduc societi',
 'new new technolog ha introduc societi',
 'new new technolog ha introduc societi',
 'one possibl outcom environmentally-induc reduct motor level richer countri outweigh ani rise motor level poorer countri',
 'one possibl outcom environmentally-induc reduct motor level richer countri outweigh ani rise motor level poorer countri',
 'one possibl outcom environmentally-induc reduct motor level richer countri outweigh ani rise motor level poorer countri',
 'one possibl outcom environmentally-induc reduct motor level richer countri outweigh ani rise motor level poorer countri',
 'everi person need know bit math scienc art literatur histori order stand societi',
 'everi person need know bit math scienc art literatur histori order stand societi']

## POS(Part Of Speech)

In [19]:
import nltk

# Using POS ==> Part Of Speech.
tagged = nltk.pos_tag(stem_token)

# Show the first ten item of tagged.
tagged[0:10]

[('new new technolog ha introduc societi', 'JJ'),
 ('new new technolog ha introduc societi', 'JJ'),
 ('new new technolog ha introduc societi', 'JJ'),
 ('new new technolog ha introduc societi', 'JJ'),
 ('one possibl outcom environmentally-induc reduct motor level richer countri outweigh ani rise motor level poorer countri',
  'JJ'),
 ('one possibl outcom environmentally-induc reduct motor level richer countri outweigh ani rise motor level poorer countri',
  'JJ'),
 ('one possibl outcom environmentally-induc reduct motor level richer countri outweigh ani rise motor level poorer countri',
  'JJ'),
 ('one possibl outcom environmentally-induc reduct motor level richer countri outweigh ani rise motor level poorer countri',
  'JJ'),
 ('everi person need know bit math scienc art literatur histori order stand societi',
  'NN'),
 ('everi person need know bit math scienc art literatur histori order stand societi',
  'NN')]

## Using spacy Library

In [20]:
import numpy as np
import spacy
import string
nlp = spacy.load('en_core_web_sm')

text = data_input[0]

for token in nlp(text):
    print(token.text, '=>', token.dep_,'=>',token.head.text)

  => dep => New
New => amod => technology
and => cc => New
new => conj => New
technology => nsubjpass => introduced
has => aux => introduced
been => auxpass => introduced
introduced => ROOT => introduced
to => prep => introduced
the => det => society
society => pobj => to
. => punct => introduced


# Visualization Dataset Using Spacy

In [21]:
# Make visualization to data to see relationship.
from spacy import displacy
displacy.render(nlp(data_input[0]), jupyter = True, style = 'dep', options = {'distance': 90})

In [22]:
# See another way to visualization data.
from spacy import displacy
displacy.render(nlp(data_input[0]), jupyter = True, style = 'dep', options = {'compact': True, 'bg': '#09a3d5',
                                                                           'distance': 90, 'color': 'white',
                                                                           'font': 'Pacifico Regular'
                                                                           })

In [112]:
# Use another way to visulaize data about using NER.
text = data_input[0]
doc = nlp(text)

displacy.render(doc, style = 'ent', jupyter = True)

KeyError: 0

In [24]:
text = data_input[100]
doc = nlp(text)

displacy.render(doc, style = 'dep', jupyter = True, options = {'distance': 60})

# Extraction Dataset Using TF-IDF

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
tfidf_vec_fit = tfidf_vec.fit(stem_token)
X_tfidf = tfidf_vec.fit_transform(stem_token)
print(X_tfidf.shape)
data_input = pd.DataFrame(X_tfidf.toarray(), columns = tfidf_vec.get_feature_names_out())
data_input.head()

(12008, 3266)


Unnamed: 0,0th,10,100,100million,12,12year,1400,1st,1th,20,...,youg,young,younger,youngster,youself,youth,ypu,zebra,zenra,zhen
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
from sklearn.model_selection import train_test_split

X, X_test, y, y_test = train_test_split(data_input, data_output,
                                        test_size = 0.20, random_state = 2)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.20, random_state = 2)

In [59]:
print(X_train.shape)
print(y_train.shape)
print()
print(X_val.shape)
print(y_val.shape)
print()
print(X_test.shape)
print(y_test.shape)

(7684, 3266)
(7684,)

(1922, 3266)
(1922,)

(2402, 3266)
(2402,)
