In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Sample data
data = {
    'text': [
        "I love this movie. It's fantastic!",
        "Horrible movie. Waste of time.",
        "Amazing film with great acting.",
        "Not worth watching. Boring plot.",
        "Incredible movie! Highly recommend."
    ],
    'label': ['positive', 'negative', 'positive', 'negative', 'positive']  # Example labels
}

# Create a DataFrame
df = pd.DataFrame(data)

# Raw documents and target variable
texts = df['text']
y = df['label']

# Split the raw text BEFORE vectorizing, so the held-out documents cannot
# influence the TF-IDF vocabulary or IDF weights (avoids train/test leakage).
texts_train, texts_test, y_train, y_test = train_test_split(
    texts, y, test_size=0.3, random_state=42
)

# Define TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and IDF weights from the training documents only,
# then reuse the fitted vectorizer to encode the test documents.
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

# Train a classifier (e.g., Logistic Regression)
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Predict labels for the test set
y_pred = classifier.predict(X_test)

# Evaluate the classifier.
# zero_division=0 reports 0.0 for a class that was never predicted instead of
# emitting an UndefinedMetricWarning for every affected metric.
print(classification_report(y_test, y_pred, zero_division=0))

# Example of classifying new text
new_texts = [
    "I found this movie quite entertaining.",
    "The film was a complete disaster."
]

# Transform new texts using the same (training-fitted) vectorizer
X_new = vectorizer.transform(new_texts)

# Predict the labels for new texts
predictions = classifier.predict(X_new)
print(f"Predicted Labels: {predictions}")


              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         1
    positive       0.50      1.00      0.67         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2

Predicted Labels: ['positive' 'positive']


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer

# Sample documents
documents = [
    "I love programming in Python.",
    "Python is a great programming language.",
    "I love machine learning with Python."
]

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the documents (result is a SciPy sparse matrix)
tfidf_matrix = vectorizer.fit_transform(documents)

# Get feature names (column order of the matrix)
feature_names = vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a dense format and print it.
# toarray() gives a plain ndarray; todense() would return the legacy
# numpy.matrix type, which NumPy no longer recommends using.
dense_matrix = tfidf_matrix.toarray()
print("TF-IDF Matrix:")
print(dense_matrix)

print("\nFeature Names:")
print(feature_names)


TF-IDF Matrix:
[[0.         0.63174505 0.         0.         0.         0.4804584
  0.         0.4804584  0.37311881 0.        ]
 [0.50461134 0.         0.50461134 0.50461134 0.         0.
  0.         0.38376993 0.29803159 0.        ]
 [0.         0.         0.         0.         0.50461134 0.38376993
  0.50461134 0.         0.29803159 0.50461134]]

Feature Names:
['great' 'in' 'is' 'language' 'learning' 'love' 'machine' 'programming'
 'python' 'with']


In [None]:
import math
import re
from collections import Counter

import numpy as np

# Sample corpus: three short documents used to illustrate document
# frequency and IDF below.
documents = [
    "I love programming in Python.",
    "Python is a great programming language.",
    "I love machine learning with Python.",
]
def tokenize(text):
    """Split ``text`` into lowercase word tokens.

    Tokens are maximal runs of word characters (letters, digits, underscore),
    so surrounding punctuation is dropped: ``"Python."`` and ``"Python"`` both
    yield ``"python"``. Splitting on whitespace alone kept the punctuation
    attached and made them two different terms.

    Args:
        text: The document to tokenize.

    Returns:
        A list of lowercase tokens in order of appearance; empty if ``text``
        contains no word characters.
    """
    return re.findall(r"\w+", text.lower())
def compute_document_frequency(docs):
    """Count, for every token, the number of documents that contain it.

    Args:
        docs: A sized collection of document strings.

    Returns:
        A ``(counts, total)`` tuple where ``counts`` is a ``Counter`` mapping
        token -> number of documents containing it, and ``total`` is
        ``len(docs)``.
    """
    total = len(docs)
    counts = Counter()
    for text in docs:
        # De-duplicate within the document first: a token repeated inside one
        # document must still contribute only 1 to its document frequency.
        counts.update(set(tokenize(text)))
    return counts, total
def compute_idf(df, num_docs):
    """Compute smoothed inverse document frequency for every token.

    Uses ``idf(t) = ln((1 + N) / (1 + df(t))) + 1``, the smoothed form
    documented for scikit-learn's ``TfidfTransformer(smooth_idf=True)``.
    Adding 1 to both numerator and denominator acts like one extra document
    containing every term, and the trailing ``+ 1`` keeps the weight strictly
    positive. The earlier ``ln(N / (1 + df))`` gave a weight of 0 to a term
    present in N - 1 documents and a negative weight to a term present in
    all N.

    Args:
        df: Mapping of token -> number of documents containing the token.
        num_docs: Total number of documents in the corpus (N).

    Returns:
        A dict mapping each token in ``df`` to its IDF weight.
    """
    return {
        token: math.log((1 + num_docs) / (1 + freq)) + 1
        for token, freq in df.items()
    }
# Document frequencies and corpus size for the sample documents.
df, num_docs = compute_document_frequency(documents)

# IDF weight for each token seen in the corpus.
idf = compute_idf(df, num_docs)

# Report one "token: weight" line per term, in the mapping's own order.
print("IDF Values:")
for term in idf:
    idf_value = idf[term]
    print(f"{term}: {idf_value}")


IDF Values:
i: 0.0
love: 0.0
in: 0.4054651081081644
python.: 0.0
programming: 0.0
great: 0.4054651081081644
is: 0.4054651081081644
python: 0.4054651081081644
a: 0.4054651081081644
language.: 0.4054651081081644
machine: 0.4054651081081644
learning: 0.4054651081081644
with: 0.4054651081081644


In [None]:
# Term frequency (TF): raw per-document token counts via CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Sample documents
documents = [
    "I love programming in Python.",
    "Python is a great programming language.",
    "I love machine learning with Python."
]

# Initialize the Count Vectorizer
vectorizer = CountVectorizer()

# Fit and transform the documents (result is a SciPy sparse matrix of counts)
tf_matrix = vectorizer.fit_transform(documents)

# Convert the TF matrix to a dense format.
# toarray() gives a plain ndarray; todense() would return the legacy
# numpy.matrix type, which NumPy no longer recommends using.
dense_matrix = tf_matrix.toarray()

# Create a DataFrame to display the TF values (one row per document,
# one column per vocabulary term)
df_tf = pd.DataFrame(dense_matrix, columns=vectorizer.get_feature_names_out())

print("Term Frequency Matrix:")
print(df_tf)


Term Frequency Matrix:
   great  in  is  language  learning  love  machine  programming  python  with
0      0   1   0         0         0     1        0            1       1     0
1      1   0   1         1         0     0        0            1       1     0
2      0   0   0         0         1     1        1            0       1     1


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
# Sample documents
documents = [
    "I love programming in Python.",
    "Python is a great programming language.",
    "I love machine learning with Python."
]

# Fit the TF-IDF vectorizer and encode the documents (sparse result)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Densify with toarray(), which gives a plain ndarray; todense() would return
# the legacy numpy.matrix type, which NumPy no longer recommends using.
dense_matrix = tfidf_matrix.toarray()

# Label the columns with the vocabulary terms (one row per document)
df_tfidf = pd.DataFrame(dense_matrix, columns=vectorizer.get_feature_names_out())

print("TF-IDF Matrix:")
print(df_tfidf)


TF-IDF Matrix:
      great        in        is  language  learning      love   machine  \
0  0.000000  0.631745  0.000000  0.000000  0.000000  0.480458  0.000000   
1  0.504611  0.000000  0.504611  0.504611  0.000000  0.000000  0.000000   
2  0.000000  0.000000  0.000000  0.000000  0.504611  0.383770  0.504611   

   programming    python      with  
0     0.480458  0.373119  0.000000  
1     0.383770  0.298032  0.000000  
2     0.000000  0.298032  0.504611  
