## Model on GOvt and Non-Govt

In [3]:
import pandas as pd
df = pd.read_csv('annotated_news_articles.csv')
df.head()

Unnamed: 0,url,published_date,language,headline,description,label,author,annotate
0,https://www.ndtv.com/india-news/in-contrasting...,01/01/2023,English,"In Contrasting New Year Speeches, Putin And Ze...","Putin signalled once again, that the war, albe...",Non-Government,Reuters,Non-Government
1,https://www.ndtv.com/world-news/not-bound-by-d...,01/01/2023,English,"Israel Not Bound By ""Despicable"" UN Vote On Pa...",The Friday vote presents a challenge for Israe...,Government,Reuters,Non-Government
2,https://www.ndtv.com/world-news/after-aircraft...,01/01/2023,English,"After Aircraft Clash, China Accuses US Of Dist...",A US military plane involved in a confrontatio...,Government,Reuters,Non-Government
3,https://www.ndtv.com/world-news/chinese-author...,01/01/2023,English,"Chinese Authorities, State Media Seek To Reass...",Authorities have been trying to reassure the p...,Government,Reuters,Non-Government
4,https://www.ndtv.com/world-news/north-korea-fi...,01/01/2023,English,North Korea Fires 4th Ballistic Missile Within...,The launch follows an unprecedented number of ...,Non-Government,Reuters,Non-Government


In [6]:

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

X = df['description']
y = df['annotate']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train an SVM model
svm_model = SVC(kernel='linear', C=1.0)
svm_model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")
print("Classification Report:\n", report)


Accuracy: 86.63829787234043
Classification Report:
                 precision    recall  f1-score   support

    Government       0.82      0.61      0.70       298
Non-Government       0.88      0.95      0.91       877

      accuracy                           0.87      1175
     macro avg       0.85      0.78      0.81      1175
  weighted avg       0.86      0.87      0.86      1175



In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Assuming you have a DataFrame df with 'Text' column and 'Label' column
# Modify this according to your dataset

# Load your dataset
df = pd.read_csv('annotated_news_articles.csv')

X = df['description']
y = df['annotate']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=10000)  # You may need to adjust max_features
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize models
models = {
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(kernel='linear', C=1.0),
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': MultinomialNB(),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

# Train and evaluate each model
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    model.fit(X_train_tfidf, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test_tfidf)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.2f}")

    # Print classification report for more details
    print(f"Classification Report for {model_name}:")
    print(classification_report(y_test, y_pred))



Training Random Forest...
Random Forest Accuracy: 0.91
Classification Report for Random Forest:
                precision    recall  f1-score   support

    Government       0.91      0.70      0.79       298
Non-Government       0.90      0.98      0.94       877

      accuracy                           0.91      1175
     macro avg       0.91      0.84      0.86      1175
  weighted avg       0.91      0.91      0.90      1175


Training SVM...
SVM Accuracy: 0.86
Classification Report for SVM:
                precision    recall  f1-score   support

    Government       0.81      0.61      0.69       298
Non-Government       0.88      0.95      0.91       877

      accuracy                           0.86      1175
     macro avg       0.84      0.78      0.80      1175
  weighted avg       0.86      0.86      0.86      1175


Training Logistic Regression...
Logistic Regression Accuracy: 0.84
Classification Report for Logistic Regression:
                precision    recall  f1-sco

In [14]:
from sklearn.tree import DecisionTreeClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=10000)  # You may need to adjust max_features
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize and train the Decision Tree model
decision_tree_model = DecisionTreeClassifier(random_state=42)
decision_tree_model.fit(X_train_tfidf, y_train)

y_pred = decision_tree_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Decision Tree Accuracy: {accuracy:.2f}")

# Print classification report for more details
print("Classification Report for Decision Tree:")
print(classification_report(y_test, y_pred))

Decision Tree Accuracy: 0.88
Classification Report for Decision Tree:
                precision    recall  f1-score   support

    Government       0.77      0.78      0.77       298
Non-Government       0.92      0.92      0.92       877

      accuracy                           0.88      1175
     macro avg       0.84      0.85      0.85      1175
  weighted avg       0.88      0.88      0.88      1175



In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=10000)  # You may need to adjust max_features
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize and train the Random Forest model
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = random_forest_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.2f}")

# Print classification report for more details
print("Classification Report for Random Forest:")
print(classification_report(y_test, y_pred))

Random Forest Accuracy: 0.90
Classification Report for Random Forest:
                precision    recall  f1-score   support

    Government       0.90      0.69      0.78       298
Non-Government       0.90      0.97      0.94       877

      accuracy                           0.90      1175
     macro avg       0.90      0.83      0.86      1175
  weighted avg       0.90      0.90      0.90      1175



## Best Result is Get from the Random Forest Mode

In [1]:
df2=pd.read_csv('my_annotate.csv')

In [18]:
df2.head()

Unnamed: 0,Sr. No.,News Link,Language,Description,Headline,Author,Date,Paragraph,Classifier
0,1,https://ndtv.in/india-news/gurugram-the-allege...,Hindi,गुरुग्राम में कथित गौ रक्षकों ने की सरेआम गुंड...,गुरुग्राम में कथित गौ रक्षकों ने की सरेआम गुंड...,"N/A, N/A","1 अगस्त, 2020 9:07 AM",['दिल्ली से सटे साइबर सिटी (Cyber \u200b\u200b...,Non-Government
1,2,https://www.ndtv.com/india-news/more-than-45-l...,English,More Than 45 Lakh People Affected In Bihar Floods,More Than 45 Lakh People Affected In Bihar Floods,,"August 01, 2020 12:36 am IST",['The number of distressed people has reached ...,Non-Government
2,3,https://www.ndtv.com/india-news/madhya-pradesh...,English,Woman Complains Of Filthy Toilets To Minister....,Woman Complains Of Filthy Toilets To Minister....,,"August 01, 2020 12:43 am IST",['Pradyuman Singh Tomar on Friday cleaned the ...,Government
3,4,https://www.ndtv.com/india-news/coronavirus-in...,English,India Part Of Gavi's New COVID-19 Global Vacci...,India Part Of Gavi's New COVID-19 Global Vacci...,,"August 01, 2020 12:48 am IST","[""Gavi hopes COVAX will enable countries to ha...",Non-Government
4,5,https://www.ndtv.com/world-news/us-teenager-ch...,English,US Teenager Charged In Twitter Hack Targeting ...,US Teenager Charged In Twitter Hack Targeting ...,,"August 01, 2020 2:18 am IST",['The attack targeted accounts of famous peopl...,Non-Government


In [21]:
tfidf_vectorizer = TfidfVectorizer(max_features=10000)  # You may need to adjust max_features
para = tfidf_vectorizer.fit_transform(df2['Paragraph'])



In [22]:
predict = random_forest_model.predict(para)

In [24]:
accuracy = accuracy_score(df2['Classifier'], predict)
print(f"Random Forest Accuracy: {accuracy:.2f}")

Random Forest Accuracy: 0.67


In [30]:
df2=pd.read_csv('my_annotate.csv')
X=df2['Paragraph']
y=df2['Classifier']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=10000)  # You may need to adjust max_features
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize and train the Random Forest model
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = random_forest_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.2f}")

# Print classification report for more details
print("Classification Report for Random Forest:")
print(classification_report(y_test, y_pred))

Random Forest Accuracy: 0.69
Classification Report for Random Forest:
                precision    recall  f1-score   support

    Government       0.84      0.27      0.41        78
Non-Government       0.67      0.97      0.79       122

      accuracy                           0.69       200
     macro avg       0.76      0.62      0.60       200
  weighted avg       0.74      0.69      0.64       200



In [32]:
pip install indic-nlp-library


Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl (40 kB)
                                              0.0/40.3 kB ? eta -:--:--
     -------------------------------------- 40.3/40.3 kB 938.1 kB/s eta 0:00:00
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.4.0-py3-none-any.whl (12 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Downloading sphinx_rtd_theme-2.0.0-py2.py3-none-any.whl (2.8 MB)
                                              0.0/2.8 MB ? eta -:--:--
     -                                        0.1/2.8 MB 3.2 MB/s eta 0:00:01
     --                                       0.2/2.8 MB 2.5 MB/s eta 0:00:02
     ---                                      0.3/2.8 MB 2.0 MB/s eta 0:00:02
     -----                                    0.4/2.8 MB 2.0 MB/s eta 0:00:02
     -----                                    0.4/2.8 MB 1.9 MB/s eta 0:00:02
     -------                                  0.5/2.8 MB 1.9 MB


[notice] A new release of pip is available: 23.1.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
pip install indic-nlp-library


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
pip install indic

Collecting indic
  Downloading indic-0.1.2.tar.gz (6.3 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: indic
  Building wheel for indic (setup.py): started
  Building wheel for indic (setup.py): finished with status 'done'
  Created wheel for indic: filename=indic-0.1.2-py3-none-any.whl size=10068 sha256=5fc747b50be422577d4597be974a97d70983730d570ad702b0bfae3c6336d548
  Stored in directory: c:\users\chaitanya\appdata\local\pip\cache\wheels\f3\78\eb\f089e00d15d844e38a3ba21c0c2c17ee526f2e0d3879a2b767
Successfully built indic
Installing collected packages: indic
Successfully installed indic-0.1.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Chaitanya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Error loading stopwords-hi: Package 'stopwords-hi' not
[nltk_data]     found in index


False

In [30]:
import pandas as pd

# Replace 'your_input_file.csv' and 'your_output_file.csv' with the actual file names
input_file = 'final_news_data_cleaned.csv'
output_file = 'news_dataa_alls1.csv'

# Try reading the CSV file with 'utf-8' encoding and errors='replace'
try:
    with open(input_file, 'r', encoding='utf-8') as file:
        df = pd.read_csv(file)
except UnicodeDecodeError:
    # If 'utf-8' encoding fails, try 'latin-1' encoding
    with open(input_file, 'r', encoding='latin-1') as file:
        df = pd.read_csv(file)

# Add a new column 'Sr. No.' with sequential numbers
df.insert(0, 'Sr. No.', range(1, 1 + len(df)))

# Save the modified DataFrame back to a new CSV file with 'utf-8-sig' encoding
df.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"Sequential 'Sr. No.' column added and saved to {output_file}")

Sequential 'Sr. No.' column added and saved to news_dataa_alls1.csv


In [31]:
df3=pd.read_csv('news_dataa_alls1.csv')

In [34]:
df3.drop(columns=['sr_no_dollar','link_number_dollar','author_dollar','paragraph_dollar'],inplace=True)

In [36]:
df3

Unnamed: 0,Sr. No.,news_link_dollar,language_dollar,description_dollar,headline_dollar,date_dollar,cleaned_paragraph
0,1,https://food.ndtv.com/hindi/rakhi-2020-this-ti...,Hindi,Rakhi 2020: इस बार रक्षाबंधन के मौके पर आसानी ...,Rakhi 2020: इस बार रक्षाबंधन के मौके पर आसानी ...,"1 अगस्त, 2020 7:02 PM",Rakshabandhan 2020: इस आसान घेवर रेसिपी को घर ...
1,2,https://ndtv.in/india-news/digvijay-singh-rahu...,Hindi,राहुल गांधी को दिग्व‍िजय सिंह की सलाह पर भड़के...,राहुल गांधी को दिग्व‍िजय सिंह की सलाह पर भड़के...,"2 अगस्त, 2020 5:33 PM",वरिष्ठ कांग्रेस नेता दिग्विजय सिंह. (Digvijaya...
2,3,https://ndtv.in/bollywood/hina-khan-masti-danc...,Hindi,"हिना खान का ईद के मौके पर Video हुआ वायरल, ट्र...","हिना खान का ईद के मौके पर Video हुआ वायरल, ट्र...","2 अगस्त, 2020 8:09 AM",हिना खान (Hina Khan) का वीडियो हुआ वायरल एक्ट्...
3,4,https://ndtv.in/india-news/jammu-and-kashmir-l...,Hindi,जम्मू-कश्मीर प्रशासन ने एक साल में हासिल कीं त...,जम्मू-कश्मीर प्रशासन ने एक साल में हासिल कीं त...,"2 अगस्त, 2020 9:37 PM",प्रतीकात्मक तस्वीर केंद्रीय गृह मंत्रालय (MHA)...
4,5,https://ndtv.in/lifestyle/multani-mitti-benefi...,Hindi,"Multani Mitti Benefits: जानें, त्वचा के लिए क्...","Multani Mitti Benefits: जानें, त्वचा के लिए क्...","5 जनवरी, 2021 3:31 PM","Multani Mitti Benefits: जानें, त्वचा के लिए क्..."
...,...,...,...,...,...,...,...
20953,20954,https://sports.ndtv.com/australia-vs-india-202...,English,"Australia vs India Live Score Ball by Ball, Au...",,"November 29, 2020 01:58 PM IST",Follow the Australia vs India 2020-21 live cri...
20954,20955,https://sports.ndtv.com/cricket/2nd-odi-kl-rah...,English,"Australia vs India: ""Did Not Adapt Quick Enoug...",,"November 29, 2020 08:11 PM IST","Backing his struggling bowlers to the hilt, In..."
20955,20956,https://sports.ndtv.com/cricket/england-vs-aus...,English,1st T20I: England Beat Australia In Last-Ball ...,,"September 05, 2020 08:36 AM IST",Tom Curran held his nerve as England beat Aust...
20956,20957,https://sports.ndtv.com/formula-1/formula-1-re...,English,Formula 1: Renault To Race Under Alpine Name F...,,"September 06, 2020 06:32 PM IST",Renault will change the name of their Formula ...


In [38]:
df4=pd.read_csv('final_news_data_cleaned.csv')
df3

Unnamed: 0,Sr. No.,news_link_dollar,language_dollar,description_dollar,headline_dollar,date_dollar,cleaned_paragraph
0,1,https://food.ndtv.com/hindi/rakhi-2020-this-ti...,Hindi,Rakhi 2020: इस बार रक्षाबंधन के मौके पर आसानी ...,Rakhi 2020: इस बार रक्षाबंधन के मौके पर आसानी ...,"1 अगस्त, 2020 7:02 PM",Rakshabandhan 2020: इस आसान घेवर रेसिपी को घर ...
1,2,https://ndtv.in/india-news/digvijay-singh-rahu...,Hindi,राहुल गांधी को दिग्व‍िजय सिंह की सलाह पर भड़के...,राहुल गांधी को दिग्व‍िजय सिंह की सलाह पर भड़के...,"2 अगस्त, 2020 5:33 PM",वरिष्ठ कांग्रेस नेता दिग्विजय सिंह. (Digvijaya...
2,3,https://ndtv.in/bollywood/hina-khan-masti-danc...,Hindi,"हिना खान का ईद के मौके पर Video हुआ वायरल, ट्र...","हिना खान का ईद के मौके पर Video हुआ वायरल, ट्र...","2 अगस्त, 2020 8:09 AM",हिना खान (Hina Khan) का वीडियो हुआ वायरल एक्ट्...
3,4,https://ndtv.in/india-news/jammu-and-kashmir-l...,Hindi,जम्मू-कश्मीर प्रशासन ने एक साल में हासिल कीं त...,जम्मू-कश्मीर प्रशासन ने एक साल में हासिल कीं त...,"2 अगस्त, 2020 9:37 PM",प्रतीकात्मक तस्वीर केंद्रीय गृह मंत्रालय (MHA)...
4,5,https://ndtv.in/lifestyle/multani-mitti-benefi...,Hindi,"Multani Mitti Benefits: जानें, त्वचा के लिए क्...","Multani Mitti Benefits: जानें, त्वचा के लिए क्...","5 जनवरी, 2021 3:31 PM","Multani Mitti Benefits: जानें, त्वचा के लिए क्..."
...,...,...,...,...,...,...,...
20953,20954,https://sports.ndtv.com/australia-vs-india-202...,English,"Australia vs India Live Score Ball by Ball, Au...",,"November 29, 2020 01:58 PM IST",Follow the Australia vs India 2020-21 live cri...
20954,20955,https://sports.ndtv.com/cricket/2nd-odi-kl-rah...,English,"Australia vs India: ""Did Not Adapt Quick Enoug...",,"November 29, 2020 08:11 PM IST","Backing his struggling bowlers to the hilt, In..."
20955,20956,https://sports.ndtv.com/cricket/england-vs-aus...,English,1st T20I: England Beat Australia In Last-Ball ...,,"September 05, 2020 08:36 AM IST",Tom Curran held his nerve as England beat Aust...
20956,20957,https://sports.ndtv.com/formula-1/formula-1-re...,English,Formula 1: Renault To Race Under Alpine Name F...,,"September 06, 2020 06:32 PM IST",Renault will change the name of their Formula ...


In [40]:
df = pd.read_csv('annotated_news_articles.csv')

X = df['description']
y = df['annotate']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=10000)  # You may need to adjust max_features
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize and train the Random Forest model
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = random_forest_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.2f}")

# Print classification report for more details
print("Classification Report for Random Forest:")
print(classification_report(y_test, y_pred))

Random Forest Accuracy: 0.90
Classification Report for Random Forest:
                precision    recall  f1-score   support

    Government       0.90      0.69      0.78       298
Non-Government       0.90      0.97      0.94       877

      accuracy                           0.90      1175
     macro avg       0.90      0.83      0.86      1175
  weighted avg       0.90      0.90      0.90      1175



In [42]:
new_paragraphs_tfidf = tfidf_vectorizer.transform(df3['cleaned_paragraph'])

# Make predictions on the new dataset
new_predictions = random_forest_model.predict(new_paragraphs_tfidf)

# Add the predictions to the new DataFrame as a new column 'Classifier'
df3['Classifier'] = new_predictions

In [43]:
df3

Unnamed: 0,Sr. No.,news_link_dollar,language_dollar,description_dollar,headline_dollar,date_dollar,cleaned_paragraph,Classifier
0,1,https://food.ndtv.com/hindi/rakhi-2020-this-ti...,Hindi,Rakhi 2020: इस बार रक्षाबंधन के मौके पर आसानी ...,Rakhi 2020: इस बार रक्षाबंधन के मौके पर आसानी ...,"1 अगस्त, 2020 7:02 PM",Rakshabandhan 2020: इस आसान घेवर रेसिपी को घर ...,Non-Government
1,2,https://ndtv.in/india-news/digvijay-singh-rahu...,Hindi,राहुल गांधी को दिग्व‍िजय सिंह की सलाह पर भड़के...,राहुल गांधी को दिग्व‍िजय सिंह की सलाह पर भड़के...,"2 अगस्त, 2020 5:33 PM",वरिष्ठ कांग्रेस नेता दिग्विजय सिंह. (Digvijaya...,Non-Government
2,3,https://ndtv.in/bollywood/hina-khan-masti-danc...,Hindi,"हिना खान का ईद के मौके पर Video हुआ वायरल, ट्र...","हिना खान का ईद के मौके पर Video हुआ वायरल, ट्र...","2 अगस्त, 2020 8:09 AM",हिना खान (Hina Khan) का वीडियो हुआ वायरल एक्ट्...,Non-Government
3,4,https://ndtv.in/india-news/jammu-and-kashmir-l...,Hindi,जम्मू-कश्मीर प्रशासन ने एक साल में हासिल कीं त...,जम्मू-कश्मीर प्रशासन ने एक साल में हासिल कीं त...,"2 अगस्त, 2020 9:37 PM",प्रतीकात्मक तस्वीर केंद्रीय गृह मंत्रालय (MHA)...,Non-Government
4,5,https://ndtv.in/lifestyle/multani-mitti-benefi...,Hindi,"Multani Mitti Benefits: जानें, त्वचा के लिए क्...","Multani Mitti Benefits: जानें, त्वचा के लिए क्...","5 जनवरी, 2021 3:31 PM","Multani Mitti Benefits: जानें, त्वचा के लिए क्...",Non-Government
...,...,...,...,...,...,...,...,...
20953,20954,https://sports.ndtv.com/australia-vs-india-202...,English,"Australia vs India Live Score Ball by Ball, Au...",,"November 29, 2020 01:58 PM IST",Follow the Australia vs India 2020-21 live cri...,Non-Government
20954,20955,https://sports.ndtv.com/cricket/2nd-odi-kl-rah...,English,"Australia vs India: ""Did Not Adapt Quick Enoug...",,"November 29, 2020 08:11 PM IST","Backing his struggling bowlers to the hilt, In...",Non-Government
20955,20956,https://sports.ndtv.com/cricket/england-vs-aus...,English,1st T20I: England Beat Australia In Last-Ball ...,,"September 05, 2020 08:36 AM IST",Tom Curran held his nerve as England beat Aust...,Non-Government
20956,20957,https://sports.ndtv.com/formula-1/formula-1-re...,English,Formula 1: Renault To Race Under Alpine Name F...,,"September 06, 2020 06:32 PM IST",Renault will change the name of their Formula ...,Non-Government


In [44]:
df3.head()

Unnamed: 0,Sr. No.,news_link_dollar,language_dollar,description_dollar,headline_dollar,date_dollar,cleaned_paragraph,Classifier
0,1,https://food.ndtv.com/hindi/rakhi-2020-this-ti...,Hindi,Rakhi 2020: इस बार रक्षाबंधन के मौके पर आसानी ...,Rakhi 2020: इस बार रक्षाबंधन के मौके पर आसानी ...,"1 अगस्त, 2020 7:02 PM",Rakshabandhan 2020: इस आसान घेवर रेसिपी को घर ...,Non-Government
1,2,https://ndtv.in/india-news/digvijay-singh-rahu...,Hindi,राहुल गांधी को दिग्व‍िजय सिंह की सलाह पर भड़के...,राहुल गांधी को दिग्व‍िजय सिंह की सलाह पर भड़के...,"2 अगस्त, 2020 5:33 PM",वरिष्ठ कांग्रेस नेता दिग्विजय सिंह. (Digvijaya...,Non-Government
2,3,https://ndtv.in/bollywood/hina-khan-masti-danc...,Hindi,"हिना खान का ईद के मौके पर Video हुआ वायरल, ट्र...","हिना खान का ईद के मौके पर Video हुआ वायरल, ट्र...","2 अगस्त, 2020 8:09 AM",हिना खान (Hina Khan) का वीडियो हुआ वायरल एक्ट्...,Non-Government
3,4,https://ndtv.in/india-news/jammu-and-kashmir-l...,Hindi,जम्मू-कश्मीर प्रशासन ने एक साल में हासिल कीं त...,जम्मू-कश्मीर प्रशासन ने एक साल में हासिल कीं त...,"2 अगस्त, 2020 9:37 PM",प्रतीकात्मक तस्वीर केंद्रीय गृह मंत्रालय (MHA)...,Non-Government
4,5,https://ndtv.in/lifestyle/multani-mitti-benefi...,Hindi,"Multani Mitti Benefits: जानें, त्वचा के लिए क्...","Multani Mitti Benefits: जानें, त्वचा के लिए क्...","5 जनवरी, 2021 3:31 PM","Multani Mitti Benefits: जानें, त्वचा के लिए क्...",Non-Government


In [60]:
df5 = pd.read_csv('cleaned_my_annotate_v2.csv')

In [61]:
df5.head()

Unnamed: 0,Sr. No.,News Link,Language,Description,Headline,Author,Date,Paragraph,Classifier
0,1,https://ndtv.in/india-news/gurugram-the-allege...,Hindi,गुरुग्राम में कथित गौ रक्षकों ने की सरेआम गुंड...,गुरुग्राम में कथित गौ रक्षकों ने की सरेआम गुंड...,"N/A, N/A","1 अगस्त, 2020 9:07 AM",['दिल्ली से सटे साइबर सिटी (Cyber u200bu200bci...,Non-Government
1,2,https://www.ndtv.com/india-news/more-than-45-l...,English,More Than 45 Lakh People Affected In Bihar Floods,More Than 45 Lakh People Affected In Bihar Floods,,"August 01, 2020 12:36 am IST",['The number of distressed people has reached ...,Non-Government
2,3,https://www.ndtv.com/india-news/madhya-pradesh...,English,Woman Complains Of Filthy Toilets To Minister....,Woman Complains Of Filthy Toilets To Minister....,,"August 01, 2020 12:43 am IST",['Pradyuman Singh Tomar on Friday cleaned the ...,Government
3,4,https://www.ndtv.com/india-news/coronavirus-in...,English,India Part Of Gavi's New COVID-19 Global Vacci...,India Part Of Gavi's New COVID-19 Global Vacci...,,"August 01, 2020 12:48 am IST","[""Gavi hopes COVAX will enable countries to ha...",Non-Government
4,5,https://www.ndtv.com/world-news/us-teenager-ch...,English,US Teenager Charged In Twitter Hack Targeting ...,US Teenager Charged In Twitter Hack Targeting ...,,"August 01, 2020 2:18 am IST",['The attack targeted accounts of famous peopl...,Non-Government


In [54]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
X = df5['Paragraph']
y = df5['Classifier']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=10000)  # You may need to adjust max_features
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize models
models = {
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(kernel='linear', C=1.0),
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': MultinomialNB(),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

# Train and evaluate each model
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    model.fit(X_train_tfidf, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test_tfidf)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.2f}")

    # Print classification report for more details
    print(f"Classification Report for {model_name}:")
    print(classification_report(y_test, y_pred))


Training Random Forest...
Random Forest Accuracy: 0.69
Classification Report for Random Forest:
                precision    recall  f1-score   support

    Government       0.78      0.27      0.40        78
Non-Government       0.67      0.95      0.79       122

      accuracy                           0.69       200
     macro avg       0.72      0.61      0.59       200
  weighted avg       0.71      0.69      0.64       200


Training SVM...
SVM Accuracy: 0.70
Classification Report for SVM:
                precision    recall  f1-score   support

    Government       0.72      0.37      0.49        78
Non-Government       0.69      0.91      0.79       122

      accuracy                           0.70       200
     macro avg       0.71      0.64      0.64       200
  weighted avg       0.71      0.70      0.67       200


Training Logistic Regression...
Logistic Regression Accuracy: 0.67
Classification Report for Logistic Regression:
                precision    recall  f1-sco

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                precision    recall  f1-score   support

    Government       0.00      0.00      0.00        78
Non-Government       0.61      1.00      0.76       122

      accuracy                           0.61       200
     macro avg       0.30      0.50      0.38       200
  weighted avg       0.37      0.61      0.46       200


Training K-Nearest Neighbors...
K-Nearest Neighbors Accuracy: 0.71
Classification Report for K-Nearest Neighbors:
                precision    recall  f1-score   support

    Government       0.70      0.45      0.55        78
Non-Government       0.71      0.88      0.79       122

      accuracy                           0.71       200
     macro avg       0.71      0.66      0.67       200
  weighted avg       0.71      0.71      0.69       200



In [55]:
df = pd.read_csv('annotated_news_articles.csv')

X = df['description']
y = df['annotate']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=10000)  # You may need to adjust max_features
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize and train the Random Forest model
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = random_forest_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.2f}")

# Print classification report for more details
print("Classification Report for Random Forest:")
print(classification_report(y_test, y_pred))

Random Forest Accuracy: 0.90
Classification Report for Random Forest:
                precision    recall  f1-score   support

    Government       0.90      0.69      0.78       298
Non-Government       0.90      0.97      0.94       877

      accuracy                           0.90      1175
     macro avg       0.90      0.83      0.86      1175
  weighted avg       0.90      0.90      0.90      1175



In [56]:
tfidf_vectorizer = TfidfVectorizer(max_features=10000)  # You may need to adjust max_features
tfidf = tfidf_vectorizer.fit_transform(df)

y_pred = random_forest_model.predict(df5['Paragraph'])



ValueError: could not convert string to float: "['दिल्ली से सटे साइबर सिटी (Cyber u200bu200bcity) गुरुग्राम में गुंडागर्दी की ऐसी तस्वीरें सामने आई हैं जिसे देखकर हर किसी का कालेजा कांप जाएगा.xa0गुरुग्राम में शुक्रवार सुबह करीब 9 बजे मीट से भरी एक पिकअप गाड़ी को कई किलोमीटर पीछा कर कुछ कथित गौ रक्षकों ने पकड़ लिया और फिर चालक को नीचे उतार कर हथोड़े से पीटना शुरु कर दिया.xa0xa0इस घटना का किसी ने मोबाइल में वीडियो बना लिया. जिसमें साफ दिख रहा है कि कथित गौ रक्षक कितनी बेरहमी से गाड़ी चालक को बीच सडक गिरा कर हथोड़े से पीट रहे हैं. इतना ही नहीं ये सब गुरुग्राम पुलिसxa0के जवानों के सामने और दर्जनों लोगों के सामने बीच सडक हो रहा था लेकिन किसी ने भी उसको बचाने की जहमत नहीं उठाई.', 'कथित गौ रक्षकों ने पहले तो बादशाहपुर कस्बे से पिकअप गाड़ी का करीब 8 किलोमीटर तक पीछा किया और गुरुग्राम की जुम्मा मस्जिद के पास पकड़ लिया जिसके बाद मस्जिद के पास ही चालक को बेहरमी से पीटते रहे.xa0xa0पिकअप चालक लुकमान को अधमरा करने के बाद कथित गौ रक्षक उसको उसी की गाड़ी में डालकर अगवा कर ले गए और वापिस बादशाहपुर ले जाकर पीटने लगे इतने में बादशाहपुर पुलिस थाने की पुलिस आई जिसके बाद पुलिस ने लुकमान को छुड़वा कर पुलिस वैन में बिठा लिया तो कथित गौ रक्षक पुलिस से ही उलझ गए. घटना की जानकारी मिलने के बाद सोहना से बीजेपी के विधायक संजय सिंह भी मौके पर पहुंचे और घायल को सड़क पर पड़े देखते रहे लेकिन किसी ने भी उसको अस्पताल पहुंचाने की जहमत तक नहीं उठाई.', 'पुलिस ने घायल को अस्पताल पहुंचाया जहां उसका इलाज चल रहा है. पुलिस ने घायल लुकमान के बयानों के आधार पर अज्ञात के खिलाफ कई धाराओं में केस दर्ज कर लिया है लेकिन अभी तक किसी की गिरफ्तारी नहीं की है.xa0xa0गाड़ी के मालिक का दावा है कि वो पिछले 50 सालों से मीट का कारोबार करते हैं और इस गाड़ी में भैंस का मीट लाया जा रहा था.xa0xa0पुलिस ने कथित गौ रक्षकों के खिलाफ केस दर्ज कर मीट का सैंपल जांच के लिए लैब में भेज दिया है.', 'झारखंड : बकरी चोरी के आरोप में दो युवकों को बांधकर भीड़ ने पीटा, एक की मौत']"

In [66]:
df5['Joined_Paragraph'] = df5['Paragraph'].apply(lambda x: ''.join(x) if x is not None else '')

# Save the updated DataFrame to a new CSV file
new_csv_filename = 'gnc.csv'
df5.to_csv(new_csv_filename, index=False)

print(f"Updated DataFrame with joined paragraphs has been saved to {new_csv_filename}")

Updated DataFrame with joined paragraphs has been saved to gnc.csv


In [67]:
df6=pd.read_csv('gnc.csv')
df6.head()

Unnamed: 0,Sr. No.,News Link,Language,Description,Headline,Author,Date,Paragraph,Classifier,Joined_Paragraph
0,1,https://ndtv.in/india-news/gurugram-the-allege...,Hindi,गुरुग्राम में कथित गौ रक्षकों ने की सरेआम गुंड...,गुरुग्राम में कथित गौ रक्षकों ने की सरेआम गुंड...,"N/A, N/A","1 अगस्त, 2020 9:07 AM",['दिल्ली से सटे साइबर सिटी (Cyber u200bu200bci...,Non-Government,['दिल्ली से सटे साइबर सिटी (Cyber u200bu200bci...
1,2,https://www.ndtv.com/india-news/more-than-45-l...,English,More Than 45 Lakh People Affected In Bihar Floods,More Than 45 Lakh People Affected In Bihar Floods,,"August 01, 2020 12:36 am IST",['The number of distressed people has reached ...,Non-Government,['The number of distressed people has reached ...
2,3,https://www.ndtv.com/india-news/madhya-pradesh...,English,Woman Complains Of Filthy Toilets To Minister....,Woman Complains Of Filthy Toilets To Minister....,,"August 01, 2020 12:43 am IST",['Pradyuman Singh Tomar on Friday cleaned the ...,Government,['Pradyuman Singh Tomar on Friday cleaned the ...
3,4,https://www.ndtv.com/india-news/coronavirus-in...,English,India Part Of Gavi's New COVID-19 Global Vacci...,India Part Of Gavi's New COVID-19 Global Vacci...,,"August 01, 2020 12:48 am IST","[""Gavi hopes COVAX will enable countries to ha...",Non-Government,"[""Gavi hopes COVAX will enable countries to ha..."
4,5,https://www.ndtv.com/world-news/us-teenager-ch...,English,US Teenager Charged In Twitter Hack Targeting ...,US Teenager Charged In Twitter Hack Targeting ...,,"August 01, 2020 2:18 am IST",['The attack targeted accounts of famous peopl...,Non-Government,['The attack targeted accounts of famous peopl...


In [68]:
tfidf_vectorizer = TfidfVectorizer(max_features=10000)  # You may need to adjust max_features
tfidf = tfidf_vectorizer.fit_transform(df)
y_pred = random_forest_model.predict(df6['Paragraph'])

ValueError: could not convert string to float: "['दिल्ली से सटे साइबर सिटी (Cyber u200bu200bcity) गुरुग्राम में गुंडागर्दी की ऐसी तस्वीरें सामने आई हैं जिसे देखकर हर किसी का कालेजा कांप जाएगा.xa0गुरुग्राम में शुक्रवार सुबह करीब 9 बजे मीट से भरी एक पिकअप गाड़ी को कई किलोमीटर पीछा कर कुछ कथित गौ रक्षकों ने पकड़ लिया और फिर चालक को नीचे उतार कर हथोड़े से पीटना शुरु कर दिया.xa0xa0इस घटना का किसी ने मोबाइल में वीडियो बना लिया. जिसमें साफ दिख रहा है कि कथित गौ रक्षक कितनी बेरहमी से गाड़ी चालक को बीच सडक गिरा कर हथोड़े से पीट रहे हैं. इतना ही नहीं ये सब गुरुग्राम पुलिसxa0के जवानों के सामने और दर्जनों लोगों के सामने बीच सडक हो रहा था लेकिन किसी ने भी उसको बचाने की जहमत नहीं उठाई.', 'कथित गौ रक्षकों ने पहले तो बादशाहपुर कस्बे से पिकअप गाड़ी का करीब 8 किलोमीटर तक पीछा किया और गुरुग्राम की जुम्मा मस्जिद के पास पकड़ लिया जिसके बाद मस्जिद के पास ही चालक को बेहरमी से पीटते रहे.xa0xa0पिकअप चालक लुकमान को अधमरा करने के बाद कथित गौ रक्षक उसको उसी की गाड़ी में डालकर अगवा कर ले गए और वापिस बादशाहपुर ले जाकर पीटने लगे इतने में बादशाहपुर पुलिस थाने की पुलिस आई जिसके बाद पुलिस ने लुकमान को छुड़वा कर पुलिस वैन में बिठा लिया तो कथित गौ रक्षक पुलिस से ही उलझ गए. घटना की जानकारी मिलने के बाद सोहना से बीजेपी के विधायक संजय सिंह भी मौके पर पहुंचे और घायल को सड़क पर पड़े देखते रहे लेकिन किसी ने भी उसको अस्पताल पहुंचाने की जहमत तक नहीं उठाई.', 'पुलिस ने घायल को अस्पताल पहुंचाया जहां उसका इलाज चल रहा है. पुलिस ने घायल लुकमान के बयानों के आधार पर अज्ञात के खिलाफ कई धाराओं में केस दर्ज कर लिया है लेकिन अभी तक किसी की गिरफ्तारी नहीं की है.xa0xa0गाड़ी के मालिक का दावा है कि वो पिछले 50 सालों से मीट का कारोबार करते हैं और इस गाड़ी में भैंस का मीट लाया जा रहा था.xa0xa0पुलिस ने कथित गौ रक्षकों के खिलाफ केस दर्ज कर मीट का सैंपल जांच के लिए लैब में भेज दिया है.', 'झारखंड : बकरी चोरी के आरोप में दो युवकों को बांधकर भीड़ ने पीटा, एक की मौत']"

In [7]:
import pandas as pd
df=pd.read_csv('my_annotate.csv');

hindi_texts = df[df['Classifier'] == 'Government']

num_hindi_texts = hindi_texts.shape[0]

print(f"Number of Govt text entries: {num_hindi_texts}")

Number of Govt text entries: 321
