In [44]:
import numpy as np
import pandas as pd

from datasets import load_dataset


## Loading the dataset from Hugging Face and exploring it

In [45]:
# Load the 'imdb' dataset with Hugging Face `datasets` and keep it in `ds`.
ds = load_dataset('imdb')

In [46]:
# Display the loaded object to see which splits it contains and how large they are.
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [47]:
# Peek at the first record of the training split.
ds['train'][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [48]:
# Inspect the feature schema (column types and label names) of the training split.
ds['train'].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['neg', 'pos'], id=None)}

In [49]:
# Materialize the two labelled splits as pandas DataFrames (the 'unsupervised' split is not used).
ds_train, ds_test = (pd.DataFrame(ds[split]) for split in ('train', 'test'))

# This dataset contains IMDb text reviews, each paired with a thumbs-down or thumbs-up sentiment (mapped to labels 0 = negative, 1 = positive)

In [50]:
# Preview the first rows of the training DataFrame.
ds_train.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [51]:
# Count the reviews per label to check the class balance of the training data.
ds_train['label'].value_counts()

label
0    12500
1    12500
Name: count, dtype: int64

## Text Vectorization

### Initialize a TF-IDF vectorizer to convert the text data into a format suitable for machine learning models, specifying the maximum number of features and removing stop words for better performance.

**TF-IDF** is a numerical statistic that indicates how important a word is to a document within a collection or corpus. It is calculated by multiplying the number of times the word appears in the document (term frequency) by the logarithm of the inverse of the fraction of documents in the corpus that contain the word (inverse document frequency).

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Limit the vocabulary to 2000 terms and drop English stop words.
vec = TfidfVectorizer(max_features=2000, stop_words='english')
# Learn the vocabulary and IDF weights from the training text and vectorize it
# in one pass; `vec` stays fitted so it can be reused on the test split.
X_train = vec.fit_transform(ds_train['text'])
# Densify only a few rows for a preview: converting the entire 25000 x 2000
# matrix would allocate roughly 400 MB just to print a truncated repr.
X_train[:5].todense()


matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [53]:
# Display the sparse matrix summary (shape, dtype, number of stored values).
X_train

<25000x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 1443094 stored elements in Compressed Sparse Row format>

## Prepare Test Data

### Transforming the test data using the same vectorizer used for the training data.

In [54]:
# Vectorize the test text with the already-fitted `vec` (transform only, no refit on test data).
X_test = vec.transform(ds_test['text'])

### Extracting the label columns as `y_train` and `y_test`

In [55]:
# Target vectors: the 'label' column of each split's DataFrame.
y_train, y_test = ds_train['label'], ds_test['label']

## Using logistic regression to see how everything is working

## Both the train and test scores are good

In [56]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier with default hyperparameters, fitted on the TF-IDF training features.
model = LogisticRegression().fit(X_train, y_train)
# Score the model on the same data it was trained on.
model.score(X_train, y_train)

0.8928

In [57]:
# Score the model on the held-out test split. `X_test` and `y_test` are already
# defined by the earlier test-preparation cells, so the 25,000 test reviews
# are not vectorized a second time here.
model.score(X_test, y_test)

0.87048

## Building `new_ds` from only the train and test splits, then saving it

In [58]:
from datasets import Dataset, DatasetDict

# Rebuild each split from its DataFrame. Passing the original feature schema
# keeps `label` as ClassLabel(names=['neg', 'pos']); without it, from_pandas
# infers a plain integer column and the class names are lost.
train = Dataset.from_pandas(ds_train, features=ds['train'].features)
test = Dataset.from_pandas(ds_test, features=ds['test'].features)
# Bundle only the two labelled splits into a DatasetDict.
new_ds = DatasetDict({'train': train, 'test': test})
# Show the resulting DatasetDict.
new_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})

In [59]:
# Save the DatasetDict to disk.
# The target is relative to the notebook's working directory instead of a
# user-specific absolute path, so the notebook runs on other machines and the
# saved notebook does not expose a local account name.
DATASET_DIR = "data/imdb_train_test"
new_ds.save_to_disk(DATASET_DIR)


Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]