In [42]:
#import libiraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import re
from xgboost import XGBClassifier
import pickle
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import PredefinedSplit
from sklearn.pipeline import Pipeline #Pipeline of transforms with a final estimator.
from sklearn.feature_extraction.text import TfidfVectorizer #to Convert text to a matrix of TF-IDF features.
from sklearn.linear_model import LogisticRegression #importing the logistic regression model
from bokeh.models import NumeralTickFormatter
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from nltk.stem.snowball import SnowballStemmer#stemmer converts words to its root form
from nltk.tokenize import word_tokenize # we will be able to extract the tokens from string of characters
from nltk.corpus import stopwords # imported it to be able to remove stop words
import nltk #The Natural Language Toolkit which are libraries for NLP

In [2]:
#loading the training data
df = pd.read_csv('xy_train.csv')
df.head()

Unnamed: 0,id,text,label
0,265723,A group of friends began to volunteer at a hom...,0
1,284269,British Prime Minister @Theresa_May on Nerve A...,0
2,207715,"In 1961, Goodyear released a kit that allows P...",0
3,551106,"Happy Birthday, Bob Barker! The Price Is Right...",0
4,8584,"Obama to Nation: 聙""Innocent Cops and Unarmed Y...",0


In [3]:
#loading the testing data
df1 = pd.read_csv('x_test.csv')
df1.head()

Unnamed: 0,id,text
0,0,stargazer
1,1,yeah
2,2,PD: Phoenix car thief gets instructions from Y...
3,3,"As Trump Accuses Iran, He Has One Problem: His..."
4,4,"""Believers"" - Hezbollah 2011"


In [4]:
# check if the data has null values or not
df.isnull().sum()

id       0
text     0
label    0
dtype: int64

In [5]:
#info of the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      60000 non-null  int64 
 1   text    60000 non-null  object
 2   label   60000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.4+ MB


In [6]:
df.shape

(60000, 3)

In [7]:
#frequency of each class
df['label'].value_counts()

0    32172
1    27596
2      232
Name: label, dtype: int64

In [8]:
#replace label 2 with 1
df.loc[df['label'] == 2, 'label'] = 1

In [9]:
df['label'].value_counts()

0    32172
1    27828
Name: label, dtype: int64

In [10]:
#Checking distribution of target variable values

df['label'].value_counts(normalize=True)

0    0.5362
1    0.4638
Name: label, dtype: float64

In [11]:
#check the duplicates in data
df.duplicated().sum()

0

In [12]:
nltk.download('punkt') #downloading the tokenizer which divides a text into a list of sentences from nltk
nltk.download('stopwords')#Stop words are words that are going to be ignored by tokenizers.

stemmer = SnowballStemmer("english") #stemmer converts words to its root form, for english language
stop_words = set(stopwords.words("english"))#to be able to remove stop words for english language

[nltk_data] Downloading package punkt to C:\Users\Ahmed
[nltk_data]     Mahmoud\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Ahmed
[nltk_data]     Mahmoud\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
def clean_text(text, for_embedding=False): #defining a function for cleaning 
    """ steps:
        - remove any html tags (< /br> often found)
        - Keep only ASCII + European Chars and whitespace, no digits
        - remove single letter chars
        - convert all whitespaces (tabs etc.) to single wspace
        if not for embedding :
        - all lowercase
        - remove stopwords, punctuation and stemm
    """
    RE_WSPACE = re.compile(r"\s+", re.IGNORECASE) #for converting whitespacs to single white space
    RE_TAGS = re.compile(r"<[^>]+>") #for removing html tags
    RE_ASCII = re.compile(r"[^A-Za-zÀ-ž ]", re.IGNORECASE) #removing ASCII chars and european chars
    RE_SINGLECHAR = re.compile(r"\b[A-Za-zÀ-ž]\b", re.IGNORECASE) # removing single letter chars
    if for_embedding: #   if not for embedding :
        # - all lowercase
        # - remove stopwords, punctuation and stemm
        # Keep punctuation
        RE_ASCII = re.compile(r"[^A-Za-zÀ-ž,.!? ]", re.IGNORECASE)
        RE_SINGLECHAR = re.compile(r"\b[A-Za-zÀ-ž,.!?]\b", re.IGNORECASE)

    text = re.sub(RE_TAGS, " ", text)
    text = re.sub(RE_ASCII, " ", text)
    text = re.sub(RE_SINGLECHAR, " ", text)
    text = re.sub(RE_WSPACE, " ", text)

    word_tokens = word_tokenize(text) ## to extract the tokens from string of characters
    words_tokens_lower = [word.lower() for word in word_tokens]

    if for_embedding:
        # no stemming, lowering and punctuation / stop words removal
        words_filtered = word_tokens
    else:
        words_filtered = [
            stemmer.stem(word) for word in words_tokens_lower if word not in stop_words
        ]

    text_clean = " ".join(words_filtered)
    return text_clean

In [14]:
df['text_clean'] = df.loc[df['text'].str.len() > 5, 'text'] #defining a text_clean column in df 
df["text_clean"] = df["text"].map( #applying the text_clean function on the 'text' column and putting the results in the text_clean column 
    lambda x: clean_text(x, for_embedding=False) if isinstance(x, str) else x
)

In [15]:
df1["text_clean"] = df1.loc[df1["text"].str.len() > 5, "text"] #defining a text clean column in testing set
df1["text_clean"] = df1["text"].map(  #applying the text_clean function on the 'text' column and putting the results in the text_clean column
    lambda x: clean_text(x, for_embedding=False) if isinstance(x, str) else x
)    

In [16]:
df.head()

Unnamed: 0,id,text,label,text_clean
0,265723,A group of friends began to volunteer at a hom...,0,group friend began volunt homeless shelter nei...
1,284269,British Prime Minister @Theresa_May on Nerve A...,0,british prime minist theresa may nerv attack f...
2,207715,"In 1961, Goodyear released a kit that allows P...",0,goodyear releas kit allow ps brought heel http...
3,551106,"Happy Birthday, Bob Barker! The Price Is Right...",0,happi birthday bob barker price right host lik...
4,8584,"Obama to Nation: 聙""Innocent Cops and Unarmed Y...",0,obama nation innoc cop unarm young black men d...


In [17]:
df1.head()

Unnamed: 0,id,text,text_clean
0,0,stargazer,stargaz
1,1,yeah,yeah
2,2,PD: Phoenix car thief gets instructions from Y...,pd phoenix car thief get instruct youtub video
3,3,"As Trump Accuses Iran, He Has One Problem: His...",trump accus iran one problem credibl
4,4,"""Believers"" - Hezbollah 2011",believ hezbollah


In [18]:
# viewing Word Frequency of most common words
word_freq = pd.Series(" ".join(df["text_clean"]).split()).value_counts()
word_freq[0:20]

year      4139
one       3305
like      3144
new       3010
look      2862
color     2742
man       2737
get       2614
trump     2602
say       2360
peopl     2326
use       2312
first     2257
make      2241
old       2233
time      2038
found     2002
poster    2002
day       1941
war       1869
dtype: int64

In [19]:
#splitting the data into x and y for training data and x for testing data
X=df['text_clean']
Y=df['label']
X_test=df1['text_clean']

### Building the pipeline - TF-idfVectorizer("word")

In [20]:
from sklearn.pipeline import Pipeline #Pipeline of transforms with a final estimator.
from sklearn.feature_extraction.text import TfidfVectorizer #to Convert text to a matrix of TF-IDF features.
from sklearn.linear_model import LogisticRegression #importing the logistic regression model


pipe = Pipeline([("tfidf", TfidfVectorizer("word")),
                 ("lr", LogisticRegression())]) #we will use the word vectorizer and the logistic regrression model


params = {
    
    "tfidf__ngram_range": [(1, 4),(2,5)],#n-gram range:tuple (min_n, max_n), The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used.
    "tfidf__max_df": np.arange(0.6, 0.8),#When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).
    "tfidf__min_df": np.arange(5,30),#When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. 
    'lr__penalty' : ['l1','l2'], #Penalized logistic regression imposes a penalty to the logistic model for having too many variables. This results in shrinking the coefficients of the less contributive variables toward zero. 
    'lr__C' : np.logspace(-3,3,30), #C value controls the strength of the penalty
    'lr__solver': [ 'liblinear','newton-cg', 'lbfgs'] #setting solver options/ranges to liblinear,newton-cg,lbfgs
}



### LR Model using RandomizedSearchCV

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, train_size = 0.8, stratify = Y, random_state = 45)#the random_state parameter is used for initializing the internal random number generator
#The stratify option instructs sklearn to divide the dataset into a test and training set with the ratio of class labels in the variable supplied.

# Create a list where train data indices are -1 and validation data indices are 0
# X_train2 (new training set), X_train
split_index = [-1 if x in X_train.index else 0 for x in X.index]

# Use the list to create PredefinedSplit
pds = PredefinedSplit(test_fold = split_index)
#
random_search_lr = RandomizedSearchCV(
    pipe, params, cv=pds, verbose=1, n_jobs=-1, 
    # number of random trials
    n_iter=10,
    scoring='roc_auc')#The degree of separability/distinction or intermingling/crossover between the forecasts of the two classes is shown by the ROC-AUC.

random_search_lr.fit(X,Y)  #fitting my data into random_search_lr results

print('best score {}'.format(random_search_lr.best_score_)) #printing best scores
print('best hyperparameter {}'.format(random_search_lr.best_params_)) #printing best hyperparameters

Fitting 1 folds for each of 10 candidates, totalling 10 fits


        nan 0.68790684        nan        nan]


best score 0.82303289399392
best hyperparameter {'tfidf__ngram_range': (1, 4), 'tfidf__min_df': 21, 'tfidf__max_df': 0.6, 'lr__solver': 'lbfgs', 'lr__penalty': 'l2', 'lr__C': 0.0041753189365604}


In [22]:
#predict output and save submission
submission = pd.DataFrame()
submission['id'] = df1['id']
submission['label']=random_search_lr.predict_proba(df1['text'])[:,1]
submission.to_csv('sample_submission_walkthrough1.csv', index=False)

### XGB Model using TfidfVectorizer("char") RandomizedSearchCV

In [23]:
pipe = Pipeline([("tfidf", TfidfVectorizer("char")), 
                 ('xgb_model', XGBClassifier())])

params = {
    
    "tfidf__ngram_range": [(1, 4),(2,5)], # #n-gram range:tuple (min_n, max_n), The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used.
    "tfidf__max_df": np.arange(0.6, 0.8),#When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).
    "tfidf__min_df": np.arange(5,30),#When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. 
    'xgb_model__min_child_weight': [1, 5, 10], #The number of samples required to form a leaf node (the end of a branch)
    'xgb_model__gamma': [0.5, 1, 1.5, 2, 5],#The gamma is an unbounded parameter from 0 to infinity that is used to control the model’s tendency to overfit.
    'xgb_model__subsample': [0.6, 0.8, 1.0], # determines how much of the initial dataset is fair game for random sampling during each iteration of the boosting process.
    'xgb_model__colsample_bytree': [0.6, 0.8, 1.0],#defines percentage of features will be used for building each tree.
    'xgb_model__max_depth': [3, 4, 5]# how deep each estimator is permitted to build a tree.
}



In [24]:
random_search_xg = RandomizedSearchCV(
    pipe, params, cv=pds, verbose=1, n_jobs=5, 
    # number of random trials
    n_iter=10,
    scoring='roc_auc')

random_search_xg.fit(X,Y) #fitting my x and y into the xgboost random search rresults

print('best score {}'.format(random_search_xg.best_score_)) #printing best scores
print('best hyperparameter {}'.format(random_search_xg.best_params_)) #printing best hyperparameters

Fitting 1 folds for each of 10 candidates, totalling 10 fits
best score 0.8172086151643863
best hyperparameter {'xgb_model__subsample': 0.8, 'xgb_model__min_child_weight': 1, 'xgb_model__max_depth': 4, 'xgb_model__gamma': 1, 'xgb_model__colsample_bytree': 1.0, 'tfidf__ngram_range': (1, 4), 'tfidf__min_df': 27, 'tfidf__max_df': 0.6}


In [25]:
#predict output and save submission
submission = pd.DataFrame()
submission['id'] = df1['id']
submission['label']=random_search_xg.predict_proba(df1['text'])[:,1]
submission.to_csv('sample_submission_walkthrough2.csv', index=False)

### ### RandomForest Model using TfidfVectorizer("word") GridSearchCV

In [26]:
from sklearn.ensemble import RandomForestClassifier

pipe = Pipeline([("tfidf", TfidfVectorizer("word")),  
                 ('my_classifier',RandomForestClassifier())])

params = {
    
    "tfidf__ngram_range": [(3, 6),(4,8)],#n-gram range:tuple (min_n, max_n), The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used.
    "tfidf__max_df": np.arange(0.3, 0.9),#When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).
    "tfidf__min_df": np.arange(8,24),#When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature.
    'my_classifier__n_estimators': [50, 100, 150],  # ranges of n_estimators which are the number of trees to be used in the forest.
    # I set my  n_estimators ranges to [50, 100, 150]
    'my_classifier__max_depth':[30, 60, 90]  
    # I set my  max depth ranges to [30, 60, 90] which are The number of splits that each decision tree is allowed to make
}



In [27]:
random_search_rnf = GridSearchCV(
    pipe, params, cv=pds, verbose=1, n_jobs=-1, 
    scoring='roc_auc')

random_search_rnf.fit(X,Y)

print('best score {}'.format(random_search_rnf.best_score_))
print('best hyperparameter {}'.format(random_search_rnf.best_params_))

Fitting 1 folds for each of 288 candidates, totalling 288 fits
best score 0.5259320962757253
best hyperparameter {'my_classifier__max_depth': 30, 'my_classifier__n_estimators': 100, 'tfidf__max_df': 0.3, 'tfidf__min_df': 8, 'tfidf__ngram_range': (3, 6)}


### LogisticRegression Model using TfidfVectorizer("word") GridSearchCV

In [28]:
# Further split the original training set to a train and a validation set
X_train, X_val, y_train, y_val = train_test_split(X, Y, train_size = 0.8, stratify = Y, random_state = 2022)
# Create a list where train data indices are -1 and validation data indices are 0
split_index = [-1 if i in X_train.index else 0 for i in X.index]
# Use the list to create PredefinedSplit
pds = PredefinedSplit(test_fold = split_index)

#set vectorizer hyperparameters
pipe = Pipeline([('cvec', TfidfVectorizer(preprocessor=clean_text,analyzer="word", max_df=0.3, min_df=10, norm="l2")),    
                 ('lr', LogisticRegression(solver='sag'))])
# Tune GridSearchCV
pipe_params = {'cvec__ngram_range': [(1,1), (2,2), (1,3)],'lr__C': [0.01, 0.1,1], 'lr__penalty': ['l1', 'l2']}
gs = GridSearchCV(pipe, param_grid=pipe_params,  scoring="roc_auc", cv=pds)
#train model with gridsearchcv
gs.fit(X, Y);
#predict best score and params
print("Train score", gs.score(X, Y))
print("Best params:", gs.best_params_)

Traceback (most recent call last):
  File "C:\Users\Ahmed Mahmoud\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Ahmed Mahmoud\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Ahmed Mahmoud\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Ahmed Mahmoud\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\Ahmed Mahmoud\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimat

Train score 0.9144142059191298
Best params: {'cvec__ngram_range': (1, 3), 'lr__C': 1, 'lr__penalty': 'l2'}


In [29]:
#predict output and save submission
submission = pd.DataFrame()
submission['id'] = df1['id']
submission['label']=gs.predict_proba(df1['text'])[:,1]
submission.to_csv('sample_submission_walkthrough2.csv', index=False)

### LogisticRegression Model using TfidfVectorizer("word") GridSearchCV

In [67]:
# Further split the original training set to a train and a validation set
X_train, X_val, y_train, y_val = train_test_split(X, Y, train_size = 0.8, stratify = Y, random_state = 2022)
# Create a list where train data indices are -1 and validation data indices are 0
split_index = [-1 if i in X_train.index else 0 for i in X.index]
# Use the list to create PredefinedSplit
pds = PredefinedSplit(test_fold = split_index)

#set vectorizer hyperparameters
pipe = Pipeline([('cvec', TfidfVectorizer(preprocessor=clean_text,analyzer="word", max_df=0.3, min_df=10, norm="l2")),    
                 ('lr', LogisticRegression(solver='sag'))])
# Tune GridSearchCV
pipe_params = {'cvec__ngram_range': [(1,1), (2,2), (1,3),(1, 4), (1, 5),(3, 6), (4, 8)],'lr__C': [0.01, 0.1,1], 'lr__penalty': ['l1', 'l2'], }
gs = GridSearchCV(pipe, param_grid=pipe_params,  scoring="roc_auc", cv=pds)
#train model with gridsearchcv
gs.fit(X, Y);
#predict best score and params
print("Train score", gs.score(X, Y))
print("Best params:", gs.best_params_)

Traceback (most recent call last):
  File "C:\Users\Ahmed Mahmoud\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Ahmed Mahmoud\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Ahmed Mahmoud\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Ahmed Mahmoud\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\Ahmed Mahmoud\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimat

Train score 0.9144144751079306
Best params: {'cvec__ngram_range': (1, 3), 'lr__C': 1, 'lr__penalty': 'l2'}


In [70]:
#predict output and save submission
submission = pd.DataFrame()
submission['id'] = df1['id']
submission['label']=gs.predict_proba(df1['text'])[:,1]
submission.to_csv('LRGS.csv', index=False)

### 2nd trial LogisticRegression Model using TfidfVectorizer("word") GridSearchCV

In [53]:
%%time
# feature creation and modelling in a single function
pipe = Pipeline([("tfidf", TfidfVectorizer('word')),
                 ("LR", LogisticRegression())])

# define parameter space to test # runtime 
params = {
    "tfidf__ngram_range": [(1, 4), (1, 5),(3, 6), (4, 8)],
    "tfidf__max_df": np.arange(0.3, 0.8),
    "tfidf__min_df": np.arange(50, 100),
     'LR__penalty' : ['l1', 'l2'],
    'LR__C' : [0.1,0.5],
    'LR__solver' : ['liblinear','sag','saga'],
    'LR__max_iter': [50,100]
}
# it is quite slow so we do 4 for now
pipe_clf = GridSearchCV(
    pipe, params, n_jobs=-1, scoring="roc_auc", cv=pds)
pipe_clf.fit(X, Y)
pickle.dump(pipe_clf, open("./clf_pipe.pck", "wb"))



Wall time: 55min 15s


In [54]:
best_params = pipe_clf.best_params_
print(best_params)

{'LR__C': 0.5, 'LR__max_iter': 50, 'LR__penalty': 'l2', 'LR__solver': 'saga', 'tfidf__max_df': 0.3, 'tfidf__min_df': 50, 'tfidf__ngram_range': (1, 5)}


In [55]:
# run pipe with optimized parameters
pipe.set_params(**best_params).fit(X, Y)
pipe_pred = pipe.predict(X_test)
report = sklearn.metrics.classification_report(y_test, pipe_pred)
print(report)

              precision    recall  f1-score   support

           0       0.81      0.80      0.80      6434
           1       0.77      0.79      0.78      5566

    accuracy                           0.79     12000
   macro avg       0.79      0.79      0.79     12000
weighted avg       0.79      0.79      0.79     12000



## Problem Formulation
### The Problem is:
We have data contains text column and each row include title and each title will be classed as fake or not. the problem is that the data contains contains various forms of words so we should apply text preprocessing techniques ti clean it.

### What is the input?
Text coulmn includes titles that will be classified as fake or not.

### What is the output?
Probability of how much is the title classified as fake.

### What data mining function is required?
Drop duplicate

Drop useless rows

re.sub

re.compile

re.IGNORECASE

word.lower()

value_counts

Steeming

Stop words

### What could be the challenges?
To remove operation signs from texts and stop words to make text more clear. To make computer understand human words, In a normal conversation between humans, things are often unsaid, whether in the form of some signal, expression, or just silence. Nevertheless, we, as humans, have the capacity to understand the underlying intent of the conversation, which a computer lacks. A second difficulty is owing to ambiguity in sentences. This may be at the word level, at the sentence level, or at the meaning level.

### What is the impact?
The model will truly understand human language and classify if news is fake or not from it's title only.

### What is an ideal solution?
Logistic regression model with word-level victorizer and GridSearchCV.

### What is the experimental protocol used and how was it carried out?
tf-idf Char level vectorizer and tf-idf Word level vectorizer. Both are good and each one of them is good with sort of data. We cannot decide which is better than other, It depends only on trained data.

### What preprocessing steps are used?
We dropped duplicate rows and clear data that includes false label.
We cleared stop words.
We cleared operations signs to make classifying easier.
Convert upper cases to lower.
Use tf-idf with word vectorizer and char vectorizer.
## Questions
### 🌈 What is the difference between Character n-gram and Word n-gram? Which one tends to suffer more from the OOV issue?
Character n-gram compute how much charachter repeated depends on the selected number, but word n-gram compute how much word repeated depends on selected number.

Character Tokenizers handles OOV words coherently by preserving the information of the word. It breaks down the OOV word into characters and represents the word in terms of these characters. It also limits the size of the vocabulary. since the 26 vocabulary contains a unique set of characters.

### 🌈 What is the difference between stop word removal and stemming? Are these techniques language-dependent?
Stop word removes words that not add much value to the meaning of the text.

like (“the”, “is”, “in”, “for”, “where”, “when”, “to”, “at” etc.).

Steeming is a text normalization technique that cuts off the end or beginning of a word by taking into account a list of common prefixes or suffixes that could be found in that word.

It is a rudimentary rule-based process of stripping the suffixes (“ing”, “ly”, “es”, “s” etc) from a word.

It is highly dependent on the task we are performing.

### 🌈 Is tokenization techniques language dependent? Why?
Yes, Tokenization is breaking the raw text into small chunks. Tokenization breaks the raw text into words, sentences called tokens. These tokens help in understanding the context or developing the model for the NLP. The tokenization helps in interpreting the meaning of the text by analyzing the sequence of the words.

### 🌈 What is the difference between count vectorizer and tf-idf vectorizer? Would it be feasible to use all possible n-grams? If not, how should you select them?
Count Vectorizer is a way to convert a given set of strings into a frequency representation.

TF-IDF stands for Term Frequency — Inverse Document Frequency and is a statistic that aims to better define how important a word is for a document, while also taking into account the relation to other documents from the same corpus.