# Datasets creation

## 1) Connect to Google Drive and import the useful libraries

In [1]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [2]:
import pickle
from pprint import pprint
import collections
import numpy 
import matplotlib.pyplot as plt
import operator
import array
import pandas as pd


# Natural Language Tool Kit (NLTK) imports
import nltk
from nltk.data  import load
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# Machine Learning Library (sklearn) imports
from sklearn import metrics
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer , CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer



In [3]:
# Install nltk, Library suited for text mining and texts processing
import nltk
nltk.download('punkt') # containing the tokeinzers
import nltk
nltk.download('stopwords')  # Frequent or meaningless words of,an,a...

# Split a text into sentences
sentence_splitter = load('tokenizers/punkt/english.pickle')

# Split a sentence into words
tokenizer = TreebankWordTokenizer()

# Reduce to word to the root
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
punctuation = set([",", ".", ";", "/", ":", "-", "--" ,"!", "?", "(", ")","'",'"',"''", "``"])

# Set of common english words
stopwords_set = set(stopwords.words('english'))  # very common words in english
 

## 2) Texts processing functions

##### Creation of a function to preprocess texts

* A. split the text in sentences

* B. split the sentences into tokens

* C. delete punctuation and common-words from the tokens

* D. reduce token-words to the root

In [5]:
# Preprocess a given text
def preprocess_text(review):
    tokens = []
    # A. Split into sentences
    for sentence in sentence_splitter.tokenize(review):
        # B. Split into tokens
        for token in tokenizer.tokenize(sentence):
            token = token.lower()
            # C. Filter on stoplist and punctuation
            if token not in stopwords_set and token not in punctuation:
                # D. Stemming (takes root)
                stem = stemmer.stem(token)
                tokens.append(stem)
    return tokens

In [6]:
preprocess_text("This is how the algorithm works")

['algorithm', 'work']

## 3) Import of texts


In [8]:
# Import the set of positive book reviews
with open('/content/gdrive/My Drive/positive_text.review', 'r') as myfile: #positive reviwes about books
    pos_books = myfile.readlines()


# Import the set of negative book reviews
with open('/content/gdrive/My Drive/negative_text.review', 'r') as myfile: #negative reviwes about books
    neg_books = myfile.readlines()


In [9]:
print("pos_books length : " , len(pos_books) , "\n")

print("neg_books length : " , len(neg_books))

pos_books length :  1000 

neg_books length :  1000


In [10]:
pos_books[6]

" This the one book that taught me more about how to use my SLR than the camera's manual itself. The step by step approach and the logical arrangement of chapters makes it a book that really teaches you photography. I also liked the way Mr. Frost uses two photographs to illustrate a filter. One take without the filter and then the same one with a filter.  Also explained very well is how longer lenses reduce depth of the photograph and small helful tips on increasing your depth of field.  An encompassing book, the last few chapters tell about how a slide show can be made more interesting and how to take care of your equipment.  The peppering of photos and illustrations make you want to go out and take pictures using the methods shown. A good buy and money well spent. My only gripe is that I preffered the smaller size of the previous edition as it fit easily in my kit bag \n"

In [11]:
neg_books[2]

" THis book was horrible.  If it was possible to rate it lower than one star i would have.  I am an avid reader and picked this book up after my mom had gotten it from a friend.  I read half of it, suffering from a headache the entire time, and then got to the part about the relationship the 13 year old boy had with a 33 year old man and i lit this book on fire.  One less copy in the world...don't waste your money.  I wish i had the time spent reading this book back so i could use it for better purposes.  THis book wasted my life \n"

## 4) Creation of the TF-IDF dataset

### 4.1) Preprocess of texts and features elicitation

B_pos:

*Dictionary*   

**keys** = index $i$ of $i_{th}$ positive book review   ; 

**values** = list of tokens resulting from the processing of text $i_{th}$

</br>

B_neg:

*Dictionary*    

**keys** = index $i$ of $i_{th}$ negative book review   ;

**values** = list of tokens resulting from the processing of text $i_{th}$

</br>

feat:

*Dictionary*   

**keys** = counter  ; 

**values**= all the different tokens appearing in positive book reviews or negative book reviews


In [12]:

# Dictionary of texts
B_pos={}
B_neg={}

# All possible words
features=[]



for i in range(1000):
  # Preprocess the i-th text
  pre_pos=preprocess_text(pos_books[i].replace("'",' '))

  pre_neg=preprocess_text(neg_books[i].replace("'",' '))

  # Add the text to the dictionary
  B_pos[i]=pre_pos

  B_neg[i]=pre_neg

  # Add all the tokens to the list
  for j in pre_pos:
    features.append(j)

  for j in pre_neg:
    features.append(j)

    
    

# list to set 
features=set(features)


# set to dictionary
feat={}

k=0
# Each word in features is associated to an integer (index of columns)
for i in features:
   feat[i]=k
   k+=1


In [13]:
print("total number of different tokens present : " , len(features))

total number of different tokens present :  17093


### 4.2) TF-IDF evaluation

In [14]:
# A document containing thousands of documents to evaluate the inverse document frequency of english words
import nltk
nltk.download('brown')




from nltk.corpus import brown
# Group of English sentences
brown_corpus = brown.sents()


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [15]:
preprocessed_sentences = []

# create a list of preprocessed sentences from brown
for sentence in brown_corpus:
    clean_sentence = preprocess_text(' '.join(sentence))
    preprocessed_sentences.append(" ".join(clean_sentence))

In [16]:
# Vectorizer based on "english" document words
vectorizer = TfidfVectorizer(stop_words='english')
tf_idf = vectorizer.fit(preprocessed_sentences)
idf = vectorizer.idf_

# dictionary word->tfidf
idf = dict(zip(vectorizer.get_feature_names(), idf))

# sort the dict based on the tfidf value
sorted_idf = sorted(idf.items(), key=operator.itemgetter(1))



### 4.3) Dataset implementation

In [17]:
# Function that given the dict of all features and the dict of the tfifd of a text, return an array (representation of the text in features dimensions)
def assign_tfidf(features,tfidf):
  length=len(features)

 # Initialize a 0 array of length = length
  arr=array.array("f", (0 for i in range(length)))
  
 # All the possible dimensions 
  rigthnames=features.keys()

 # For each element in the tfidf dict 
  for el in tfidf:
     names=tfidf.keys()
     
     for name in names:
         # It should be always true but just to be sure
         if(name in rigthnames):

           # index= integer associated to the name in features dict
           index=features[name]

           # Tfidf associated to the name in tfidf dict
           value=tfidf[name]

           # update the value of arr
           arr[index]=value
       
  return(arr)

In [18]:
import warnings
warnings.filterwarnings("ignore") # .get_feature_name "deprecated"
dataset_pos=[]


for i in range(1000):
  
  # Book review document 
  book_doc = " ".join(B_pos[i])
  # Compute TF-IDF
  result = vectorizer.transform([book_doc])

  feature_names = tf_idf.get_feature_names()

  # tfidf as a dict  word - tfidf value
  tfidf = {}
  for col in result.nonzero()[1]:
      tfidf[feature_names[col]]=result[0, col]

  dataset_pos.append(assign_tfidf(feat,tfidf))
  


In [19]:
features=list(features)
features.append("type_text")

In [20]:
dataset_pos=pd.DataFrame(dataset_pos)
dataset_pos=dataset_pos.assign(type_text=[1 for i in range(1000)])
dataset_pos.columns=features
dataset_pos


Unnamed: 0,sidetrack,uniti,unprotect,fine-tun,-but,exasper,certif,detent,theater,cash-in,...,wife,psycho-drama,2.2.19-7.0.1,-plan,darwin,nake,not-so-divafi,non-tradit,easi,type_text
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191918,1


In [21]:
import warnings
warnings.filterwarnings("ignore") # .get_feature_name "deprecated"
dataset_neg=[]


for i in range(1000):
  
  # Book review document 
  book_doc = " ".join(B_neg[i])
  # Compute TF-IDF
  result = vectorizer.transform([book_doc])

  feature_names = tf_idf.get_feature_names()

  # tfidf as a dict  word - tfidf value
  tfidf = {}
  for col in result.nonzero()[1]:
      tfidf[feature_names[col]]=result[0, col]

  dataset_neg.append(assign_tfidf(feat,tfidf))
  


In [22]:
dataset_neg=pd.DataFrame(dataset_neg)
dataset_neg=dataset_neg.assign(type_text=[0 for i in range(1000)])
dataset_neg.columns=features
dataset_neg


Unnamed: 0,sidetrack,uniti,unprotect,fine-tun,-but,exasper,certif,detent,theater,cash-in,...,wife,psycho-drama,2.2.19-7.0.1,-plan,darwin,nake,not-so-divafi,non-tradit,easi,type_text
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [23]:
dataset_tfidf=pd.concat((dataset_neg , dataset_pos))
dataset_tfidf

Unnamed: 0,sidetrack,uniti,unprotect,fine-tun,-but,exasper,certif,detent,theater,cash-in,...,wife,psycho-drama,2.2.19-7.0.1,-plan,darwin,nake,not-so-divafi,non-tradit,easi,type_text
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191918,1


In [24]:
dataset_tfidf.to_csv(
    
path_or_buf="/content/gdrive/My Drive/Datasets/tfidf.csv",
sep=',',
)

## 5) Creation of TF dataset

dataset_pos_tf:

*list of dictionaries*

One dictionary per text: 

**keys**=tokens  ; 
 
**values**  =Token frequency in the text

In [25]:
dataset_pos_tf=[]



for i in range(1000):
# all different tokens of a single text
   features_to_consider=set(B_pos[i])
# list containing tuples   ("token" , frequency)   one list per document 
   dict_pos={}

   for token in features_to_consider:

     dict_pos[token]= B_pos[i].count(token) 

   dataset_pos_tf.append(dict_pos)

   






Use the TF-IDF dataset just to resume the structure of the Dataframe (features and rows)

In [26]:
dat=dataset_pos
dat

Unnamed: 0,sidetrack,uniti,unprotect,fine-tun,-but,exasper,certif,detent,theater,cash-in,...,wife,psycho-drama,2.2.19-7.0.1,-plan,darwin,nake,not-so-divafi,non-tradit,easi,type_text
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191918,1


Function to convert the TF-IDF dataset into a TF dataset

In [27]:
def convert_to_dataset(list_of_dict, dat):

 for i in range(1000):

    for word in list_of_dict[i].keys():

       dat.at[i,word]=list_of_dict[i][word]
       
 return dat


In [28]:
A=convert_to_dataset(dataset_pos_tf,dat)

In [29]:
A

Unnamed: 0,sidetrack,uniti,unprotect,fine-tun,-but,exasper,certif,detent,theater,cash-in,...,wife,psycho-drama,2.2.19-7.0.1,-plan,darwin,nake,not-so-divafi,non-tradit,easi,type_text
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1


In [30]:
dataset_neg_tf=[]



for i in range(1000):
# all different tokens of a single text
   features_to_consider=set(B_neg[i])
# list containing tuples   ("token" , frequency)   one list per document 
   dict_neg={}

   for token in features_to_consider:

     dict_neg[token]= B_neg[i].count(token) 

   dataset_neg_tf.append(dict_neg)


In [31]:
dat1=dataset_neg

In [32]:
B=convert_to_dataset(dataset_neg_tf,dat1)

In [33]:
B

Unnamed: 0,sidetrack,uniti,unprotect,fine-tun,-but,exasper,certif,detent,theater,cash-in,...,wife,psycho-drama,2.2.19-7.0.1,-plan,darwin,nake,not-so-divafi,non-tradit,easi,type_text
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [34]:
dataset_tf=pd.concat((A , B))
dataset_tf

Unnamed: 0,sidetrack,uniti,unprotect,fine-tun,-but,exasper,certif,detent,theater,cash-in,...,wife,psycho-drama,2.2.19-7.0.1,-plan,darwin,nake,not-so-divafi,non-tradit,easi,type_text
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [35]:
dataset_tf.to_csv(
    
path_or_buf="/content/gdrive/My Drive/Datasets/tf.csv",
sep=',',
)